diff options
Diffstat (limited to 'mm')
143 files changed, 21129 insertions, 10513 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0331f1461f81..ff7b209dec05 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -23,7 +23,7 @@ menuconfig SWAP in your computer. If unsure say Y. config ZSWAP - bool "Compressed cache for swap pages (EXPERIMENTAL)" + bool "Compressed cache for swap pages" depends on SWAP select FRONTSWAP select CRYPTO @@ -36,12 +36,6 @@ config ZSWAP in the case where decompressing from RAM is faster than swap device reads, can also improve workload performance. - This is marked experimental because it is a new feature (as of - v3.11) that interacts heavily with memory reclaim. While these - interactions don't cause any known issues on simple memory setups, - they have not be fully explored on the large set of potential - configurations and workloads that exist. - config ZSWAP_DEFAULT_ON bool "Enable the compressed cache for swap pages by default" depends on ZSWAP @@ -225,17 +219,43 @@ config SLUB and has enhanced diagnostics. SLUB is the default choice for a slab allocator. -config SLOB +config SLOB_DEPRECATED depends on EXPERT - bool "SLOB (Simple Allocator)" + bool "SLOB (Simple Allocator - DEPRECATED)" depends on !PREEMPT_RT help + Deprecated and scheduled for removal in a few cycles. SLUB + recommended as replacement. CONFIG_SLUB_TINY can be considered + on systems with 16MB or less RAM. + + If you need SLOB to stay, please contact linux-mm@kvack.org and + people listed in the SLAB ALLOCATOR section of MAINTAINERS file, + with your use case. + SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but does not perform as well on large systems. endchoice +config SLOB + bool + default y + depends on SLOB_DEPRECATED + +config SLUB_TINY + bool "Configure SLUB for minimal memory footprint" + depends on SLUB && EXPERT + select SLAB_MERGE_DEFAULT + help + Configures the SLUB allocator in a way to achieve minimal memory + footprint, sacrificing scalability, debugging and other features. + This is intended only for the smallest system that had used the + SLOB allocator and is not recommended for systems with more than + 16MB RAM. + + If unsure, say N. + config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" default y @@ -253,7 +273,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" - depends on SLAB || SLUB + depends on SLAB || (SLUB && !SLUB_TINY) help Randomizes the freelist order used on creating new pages. This security feature reduces the predictability of the kernel slab @@ -261,7 +281,7 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" - depends on SLAB || SLUB + depends on SLAB || (SLUB && !SLUB_TINY) help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance @@ -273,7 +293,7 @@ config SLAB_FREELIST_HARDENED config SLUB_STATS default n bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS + depends on SLUB && SYSFS && !SLUB_TINY help SLUB statistics are useful to debug SLUBs allocation behavior in order find ways to optimize the allocator. This should never be @@ -285,7 +305,7 @@ config SLUB_STATS config SLUB_CPU_PARTIAL default y - depends on SLUB && SMP + depends on SLUB && SMP && !SLUB_TINY bool "SLUB per cpu partial cache" help Per cpu partial caches accelerate objects allocation and freeing @@ -579,6 +599,12 @@ config COMPACTION it and then we would be really interested to hear about that at linux-mm@kvack.org. +config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION + default 0 if PREEMPT_RT + default 1 + # # support for free page reporting config PAGE_REPORTING @@ -775,7 +801,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. XXX: For now, swap cluster backing transparent huge page @@ -1005,6 +1031,14 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config ARCH_USES_PG_ARCH_X + bool + help + Enable the definition of PG_arch_x page flags with x > 1. Only + suitable for 64-bit architectures with CONFIG_FLATMEM or + CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be + enough room for additional bits in page->flags. + config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EXPERT @@ -1044,7 +1078,7 @@ config GUP_TEST comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS -config GUP_GET_PTE_LOW_HIGH +config GUP_GET_PXX_LOW_HIGH bool config ARCH_HAS_PTE_SPECIAL @@ -1074,7 +1108,13 @@ config IO_MAPPING bool config SECRETMEM - def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + default y + bool "Enable memfd_secret() system call" if EXPERT + depends on ARCH_HAS_SET_DIRECT_MAP + help + Enable the memfd_secret() system call with the ability to create + memory areas visible only in the context of the owning process and + not mapped to other processes and other kernel page tables. config ANON_VMA_NAME bool "Anonymous VMA name support" @@ -1107,23 +1147,42 @@ config HAVE_ARCH_USERFAULTFD_MINOR help Arch has userfaultfd minor fault support -config PTE_MARKER - bool - - help - Allows to create marker PTEs for file-backed memory. - config PTE_MARKER_UFFD_WP bool "Userfaultfd write protection support for shmem/hugetlbfs" default y depends on HAVE_ARCH_USERFAULTFD_WP - select PTE_MARKER help Allows to create marker PTEs for userfaultfd write protection purposes. It is required to enable userfaultfd write protection on file-backed memory types like shmem and hugetlbfs. +# multi-gen LRU { +config LRU_GEN + bool "Multi-Gen LRU" + depends on MMU + # make sure folio->flags has enough spare bits + depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP + help + A high performance LRU implementation to overcommit memory. See + Documentation/admin-guide/mm/multigen_lru.rst for details. + +config LRU_GEN_ENABLED + bool "Enable by default" + depends on LRU_GEN + help + This option enables the multi-gen LRU by default. + +config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN + help + Do not enable this option unless you plan to look at historical stats + from evicted generations for debugging purpose. + + This option has a per-memcg and per-node memory overhead. +# } + source "mm/damon/Kconfig" endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index ce8dded36de9..fca699ad1fb0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -56,7 +56,7 @@ config DEBUG_SLAB config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS + depends on SLUB && SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can diff --git a/mm/Makefile b/mm/Makefile index 9a564f836403..8e105e5b3e29 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -89,14 +89,18 @@ obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ +obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o -obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o +ifdef CONFIG_SWAP +obj-$(CONFIG_MEMCG) += swap_cgroup.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index de65cb1e5f76..a53b9360b72e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -178,7 +178,26 @@ static ssize_t min_ratio_store(struct device *dev, return ret; } -BDI_SHOW(min_ratio, bdi->min_ratio) +BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) + +static ssize_t min_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -197,7 +216,82 @@ static ssize_t max_ratio_store(struct device *dev, return ret; } -BDI_SHOW(max_ratio, bdi->max_ratio) +BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) + +static ssize_t max_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_ratio_fine, bdi->max_ratio) + +static ssize_t min_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); +} + +static ssize_t min_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_min_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(min_bytes); + +static ssize_t max_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); +} + +static ssize_t max_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_max_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, @@ -209,11 +303,44 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strict_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int strict_limit; + ssize_t ret; + + ret = kstrtouint(buf, 10, &strict_limit); + if (ret < 0) + return ret; + + ret = bdi_set_strict_limit(bdi, strict_limit); + if (!ret) + ret = count; + + return ret; +} + +static ssize_t strict_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strict_limit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, + &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, + &dev_attr_max_ratio_fine.attr, + &dev_attr_min_bytes.attr, + &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); @@ -776,21 +903,17 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) int bdi_init(struct backing_dev_info *bdi) { - int ret; - bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; - bdi->max_ratio = 100; + bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); - ret = cgwb_bdi_init(bdi); - - return ret; + return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index c3ffe253e055..602fff89b15f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; - char name[CMA_MAX_NAME]; - scnprintf(name, sizeof(name), "cma-%s", cma->name); - - tmp = debugfs_create_dir(name, root_dentry); + tmp = debugfs_create_dir(cma->name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); diff --git a/mm/compaction.c b/mm/compaction.c index 640fa76228dd..ca1603524bbe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -52,8 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) #define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) -#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) -#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) /* * Page order with-respect-to which proactive compaction @@ -404,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, if (cc->ignore_skip_hint) return false; - if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + if (!pageblock_aligned(pfn)) return false; skip = get_pageblock_skip(page); @@ -886,7 +884,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * COMPACT_CLUSTER_MAX at a time so the second call must * not falsely conclude that the block should be skipped. */ - if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!valid_page && pageblock_aligned(low_pfn)) { if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; page = NULL; @@ -987,28 +985,28 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) + goto isolate_fail; + + /* * Migration will fail if an anonymous page is pinned in memory, * so avoid taking lru_lock and isolating it unnecessarily in an * admittedly racy check. */ mapping = page_mapping(page); - if (!mapping && page_count(page) > page_mapcount(page)) - goto isolate_fail; + if (!mapping && (page_count(page) - 1) > total_mapcount(page)) + goto isolate_fail_put; /* * Only allow to migrate anonymous pages in GFP_NOFS context * because those do not depend on fs locks. */ if (!(cc->gfp_mask & __GFP_FS) && mapping) - goto isolate_fail; - - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - if (unlikely(!get_page_unless_zero(page))) - goto isolate_fail; + goto isolate_fail_put; /* Only take pages on LRU: a check now makes later tests safe */ if (!PageLRU(page)) @@ -1346,7 +1344,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) } static void -fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +fast_isolate_around(struct compact_control *cc, unsigned long pfn) { unsigned long start_pfn, end_pfn; struct page *page; @@ -1367,21 +1365,13 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long if (!page) return; - /* Scan before */ - if (start_pfn != pfn) { - isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); - if (cc->nr_freepages >= cc->nr_migratepages) - return; - } - - /* Scan after */ - start_pfn = pfn + nr_isolated; - if (start_pfn < end_pfn) - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (cc->nr_freepages < cc->nr_migratepages) set_pageblock_skip(page); + + return; } /* Search orders in round-robin fashion */ @@ -1558,7 +1548,7 @@ fast_isolate_freepages(struct compact_control *cc) return cc->free_pfn; low_pfn = page_to_pfn(page); - fast_isolate_around(cc, low_pfn, nr_isolated); + fast_isolate_around(cc, low_pfn); return low_pfn; } @@ -1727,11 +1717,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. */ -#ifdef CONFIG_PREEMPT_RT -int sysctl_compact_unevictable_allowed __read_mostly = 0; -#else -int sysctl_compact_unevictable_allowed __read_mostly = 1; -#endif +int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) @@ -1853,7 +1839,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } @@ -1939,7 +1924,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) * before making it "skip" so other compaction instances do * not scan the same block. */ - if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + if (pageblock_aligned(low_pfn) && !fast_find_block && !isolation_suitable(cc, page)) continue; @@ -1981,9 +1966,21 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +/* + * Determine whether kswapd is (or recently was!) running on this node. + * + * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't + * zero it. + */ static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && task_is_running(pgdat->kswapd); + bool running; + + pgdat_kswapd_lock(pgdat); + running = pgdat->kswapd && task_is_running(pgdat->kswapd); + pgdat_kswapd_unlock(pgdat); + + return running; } /* @@ -2113,7 +2110,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * migration source is unmovable/reclaimable but it's not worth * special casing. */ - if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 66265e3a9c65..7821fcb3f258 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -68,6 +68,9 @@ config DAMON_DBGFS If unsure, say N. + This will be removed after >5.15.y LTS kernel is released, so users + should move to the sysfs interface (DAMON_SYSFS). + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 3e6b8ad73858..f7add3f4aa79 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o -obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o -obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o +obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 573669566f84..3db9b7368756 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -126,7 +126,7 @@ static void damon_test_split_at(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); - damon_split_region_at(c, t, r, 25); + damon_split_region_at(t, r, 25); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); @@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(c, t, 2); + damon_split_regions_of(t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(c, t, 4); + damon_split_regions_of(t, 4); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); @@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); } +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); + damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), {}, }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 7d25dc582fe3..ceec75b88ef9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -29,6 +29,8 @@ static bool running_exclusive_ctxs; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; +static struct kmem_cache *damon_region_cache __ro_after_init; + /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ static bool __damon_is_registered_ops(enum damon_ops_id id) { @@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; - region = kmalloc(sizeof(*region), GFP_KERNEL); + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; @@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t) static void damon_free_region(struct damon_region *r) { - kfree(r); + kmem_cache_free(damon_region_cache, r); } void damon_destroy_region(struct damon_region *r, struct damon_target *t) @@ -169,6 +171,30 @@ static bool damon_intersect(struct damon_region *r, } /* + * Fill holes in regions with new regions. + */ +static int damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, r, next, t); + } + } + return 0; +} + +/* * damon_set_regions() - Set regions of a target for given address ranges. * @t: the given target. * @ranges: array of new monitoring target ranges. @@ -184,6 +210,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, { struct damon_region *r, *next; unsigned int i; + int err; /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { @@ -195,6 +222,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; @@ -202,7 +230,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, range = &ranges[i]; /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { + damon_for_each_region_from(r, t) { if (damon_intersect(r, range)) { if (!first) first = r; @@ -225,52 +253,46 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; } } return 0; } -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks) +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->min_sz_region = min_sz_region; - scheme->max_sz_region = max_sz_region; - scheme->min_nr_accesses = min_nr_accesses; - scheme->max_nr_accesses = max_nr_accesses; - scheme->min_age_region = min_age_region; - scheme->max_age_region = max_age_region; + scheme->pattern = *pattern; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota.ms = quota->ms; - scheme->quota.sz = quota->sz; - scheme->quota.reset_interval = quota->reset_interval; - scheme->quota.weight_sz = quota->weight_sz; - scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; - scheme->quota.weight_age = quota->weight_age; - scheme->quota.total_charged_sz = 0; - scheme->quota.total_charged_ns = 0; - scheme->quota.esz = 0; - scheme->quota.charged_sz = 0; - scheme->quota.charged_from = 0; - scheme->quota.charge_target_from = NULL; - scheme->quota.charge_addr_from = 0; - - scheme->wmarks.metric = wmarks->metric; - scheme->wmarks.interval = wmarks->interval; - scheme->wmarks.high = wmarks->high; - scheme->wmarks.mid = wmarks->mid; - scheme->wmarks.low = wmarks->low; + scheme->quota = *(damos_quota_init_priv(quota)); + + scheme->wmarks = *wmarks; scheme->wmarks.activated = true; return scheme; @@ -313,6 +335,7 @@ struct damon_target *damon_new_target(void) t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); + INIT_LIST_HEAD(&t->list); return t; } @@ -360,17 +383,17 @@ struct damon_ctx *damon_new_ctx(void) if (!ctx) return NULL; - ctx->sample_interval = 5 * 1000; - ctx->aggr_interval = 100 * 1000; - ctx->ops_update_interval = 60 * 1000 * 1000; + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); - ctx->min_nr_regions = 10; - ctx->max_nr_regions = 1000; + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -406,32 +429,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx) /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context - * @sample_int: time interval between samplings - * @aggr_int: time interval between aggregations - * @ops_upd_int: time interval between monitoring operations updates - * @min_nr_reg: minimal number of regions - * @max_nr_reg: maximum number of regions + * @attrs: monitoring attributes * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. */ -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg) +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { - if (min_nr_reg < 3) + if (attrs->min_nr_regions < 3) return -EINVAL; - if (min_nr_reg > max_nr_reg) + if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; - ctx->sample_interval = sample_int; - ctx->aggr_interval = aggr_int; - ctx->ops_update_interval = ops_upd_int; - ctx->min_nr_regions = min_nr_reg; - ctx->max_nr_regions = max_nr_reg; - + ctx->attrs = *attrs; return 0; } @@ -443,10 +455,8 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, * * This function should not be called while the kdamond of the context is * running. - * - * Return: 0 if success, or negative error code otherwise. */ -int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes) { struct damos *s, *next; @@ -456,7 +466,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, damon_destroy_scheme(s); for (i = 0; i < nr_schemes; i++) damon_add_scheme(ctx, schemes[i]); - return 0; } /** @@ -482,11 +491,11 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - sz += r->ar.end - r->ar.start; + sz += damon_sz_region(r); } - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; @@ -635,7 +644,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline, static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->aggr_interval); + ctx->attrs.aggr_interval); } /* @@ -658,19 +667,20 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } -static void damon_split_region_at(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - unsigned long sz_r); +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r); static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; - sz = r->ar.end - r->ar.start; - return s->min_sz_region <= sz && sz <= s->max_sz_region && - s->min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->max_nr_accesses && - s->min_age_region <= r->age && r->age <= s->max_age_region; + sz = damon_sz_region(r); + return s->pattern.min_sz_region <= sz && + sz <= s->pattern.max_sz_region && + s->pattern.min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_age_region <= r->age && + r->age <= s->pattern.max_age_region; } static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, @@ -684,6 +694,115 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } +/* + * damos_skip_charged_region() - Check if the given region or starting part of + * it is already charged for the DAMOS quota. + * @t: The target of the region. + * @rp: The pointer to the region. + * @s: The scheme to be applied. + * + * If a quota of a scheme has exceeded in a quota charge window, the scheme's + * action would applied to only a part of the target access pattern fulfilling + * regions. To avoid applying the scheme action to only already applied + * regions, DAMON skips applying the scheme action to the regions that charged + * in the previous charge window. + * + * This function checks if a given region should be skipped or not for the + * reason. If only the starting part of the region has previously charged, + * this function splits the region into two so that the second one covers the + * area that not charged in the previous charge widnow and saves the second + * region in *rp and returns false, so that the caller can apply DAMON action + * to the second one. + * + * Return: true if the region should be entirely skipped, false otherwise. + */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + int err = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + damos_update_stat(s, sz, sz_applied); +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -692,9 +811,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = r->ar.end - r->ar.start; - struct timespec64 begin, end; - unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -703,70 +819,13 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - /* Skip previously charged regions */ - if (quota->charge_target_from) { - if (t != quota->charge_target_from) - continue; - if (r == damon_last_region(t)) { - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - continue; - } - if (quota->charge_addr_from && - r->ar.end <= quota->charge_addr_from) - continue; - - if (quota->charge_addr_from && r->ar.start < - quota->charge_addr_from) { - sz = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); - if (!sz) { - if (r->ar.end - r->ar.start <= - DAMON_MIN_REGION) - continue; - sz = DAMON_MIN_REGION; - } - damon_split_region_at(c, t, r, sz); - r = damon_next_region(r); - sz = r->ar.end - r->ar.start; - } - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - } + if (damos_skip_charged_region(t, &r, s)) + continue; if (!damos_valid_target(c, t, r, s)) continue; - /* Apply the scheme */ - if (c->ops.apply_scheme) { - if (quota->esz && - quota->charged_sz + sz > quota->esz) { - sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); - if (!sz) - goto update_stat; - damon_split_region_at(c, t, r, sz); - } - ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); - ktime_get_coarse_ts64(&end); - quota->total_charged_ns += timespec64_to_ns(&end) - - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { - quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; - } - } - if (s->action != DAMOS_STAT) - r->age = 0; - -update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_apply_scheme(c, t, r, s); } } @@ -793,60 +852,64 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } -static void kdamond_apply_schemes(struct damon_ctx *c) +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) { + struct damos_quota *quota = &s->quota; struct damon_target *t; - struct damon_region *r, *next_r; - struct damos *s; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; - damon_for_each_scheme(s, c) { - struct damos_quota *quota = &s->quota; - unsigned long cumulated_sz; - unsigned int score, max_score = 0; + if (!quota->ms && !quota->sz) + return; - if (!s->wmarks.activated) - continue; + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } - if (!quota->ms && !quota->sz) - continue; + if (!c->ops.get_scheme_score) + return; - /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + - msecs_to_jiffies( - quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) - s->stat.qt_exceeds++; - quota->total_charged_sz += quota->charged_sz; - quota->charged_from = jiffies; - quota->charged_sz = 0; - damos_set_effective_quota(quota); + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; } + } - if (!c->ops.get_scheme_score) - continue; + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} - /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - if (!__damos_valid_target(r, s)) - continue; - score = c->ops.get_scheme_score( - c, t, r, s); - quota->histogram[score] += - r->ar.end - r->ar.start; - if (score > max_score) - max_score = score; - } - } +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; - /* Set the min score limit */ - for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; - if (cumulated_sz >= quota->esz || !score) - break; - } - quota->min_score = score; + damon_for_each_scheme(s, c) { + if (!s->wmarks.activated) + continue; + + damos_adjust_quota(c, s); } damon_for_each_target(t, c) { @@ -855,18 +918,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -static inline unsigned long sz_damon_region(struct damon_region *r) -{ - return r->ar.end - r->ar.start; -} - /* * Merge two adjacent regions into one region */ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { - unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); @@ -895,7 +953,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && - sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else prev = r; @@ -928,9 +986,8 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, * r the region to be split * sz_r size of the first sub-region that will be made */ -static void damon_split_region_at(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - unsigned long sz_r) +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r) { struct damon_region *new; @@ -947,15 +1004,14 @@ static void damon_split_region_at(struct damon_ctx *ctx, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_ctx *ctx, - struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; int i; damon_for_each_region_safe(r, next, t) { - sz_region = r->ar.end - r->ar.start; + sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && sz_region > 2 * DAMON_MIN_REGION; i++) { @@ -969,7 +1025,7 @@ static void damon_split_regions_of(struct damon_ctx *ctx, if (sz_sub == 0 || sz_sub >= sz_region) continue; - damon_split_region_at(ctx, t, r, sz_sub); + damon_split_region_at(t, r, sz_sub); sz_region = sz_sub; } } @@ -995,16 +1051,16 @@ static void kdamond_split_regions(struct damon_ctx *ctx) damon_for_each_target(t, ctx) nr_regions += damon_nr_regions(t); - if (nr_regions > ctx->max_nr_regions / 2) + if (nr_regions > ctx->attrs.max_nr_regions / 2) return; /* Maybe the middle of the region has different access frequency */ if (last_nr_regions == nr_regions && - nr_regions < ctx->max_nr_regions / 3) + nr_regions < ctx->attrs.max_nr_regions / 3) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(ctx, t, nr_subregions); + damon_split_regions_of(t, nr_subregions); last_nr_regions = nr_regions; } @@ -1018,7 +1074,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) static bool kdamond_need_update_operations(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->ops_update_interval); + ctx->attrs.ops_update_interval); } /* @@ -1142,32 +1198,27 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - done = true; + goto done; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) { - done = true; - continue; - } + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) { - done = true; - continue; - } + ctx->callback.after_sampling(ctx)) + break; - kdamond_usleep(ctx->sample_interval); + kdamond_usleep(ctx->attrs.sample_interval); if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); @@ -1177,10 +1228,8 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) { - done = true; - continue; - } + ctx->callback.after_aggregation(ctx)) + break; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1194,6 +1243,7 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); } } +done: damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) damon_destroy_region(r, t); @@ -1218,4 +1268,90 @@ static int kdamond_fn(void *data) return 0; } +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + +static int __init damon_init(void) +{ + damon_region_cache = KMEM_CACHE(damon_region, 0); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + #include "core-test.h" diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index cfdf63132d5a..b3f454a5c682 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->sample_interval, ctx->aggr_interval, - ctx->ops_update_interval, ctx->min_nr_regions, - ctx->max_nr_regions); + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); mutex_unlock(&ctx->kdamond_lock); return simple_read_from_buffer(buf, count, ppos, kbuf, ret); @@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - unsigned long s, a, r, minr, maxr; + struct damon_attrs attrs; char *kbuf; ssize_t ret; @@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file, return PTR_ERR(kbuf); if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &s, &a, &r, &minr, &maxr) != 5) { + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { ret = -EINVAL; goto out; } @@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + ret = damon_set_attrs(ctx, &attrs); if (!ret) ret = count; unlock_out: @@ -131,9 +134,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->min_sz_region, s->max_sz_region, - s->min_nr_accesses, s->max_nr_accesses, - s->min_age_region, s->max_age_region, + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, @@ -221,8 +227,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, struct damos *scheme, **schemes; const int max_nr_schemes = 256; int pos = 0, parsed, ret; - unsigned long min_sz, max_sz; - unsigned int min_nr_a, max_nr_a, min_age, max_age; unsigned int action_input; enum damos_action action; @@ -233,13 +237,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; struct damos_quota quota = {}; struct damos_watermarks wmarks; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action_input, "a.ms, + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, "a.weight_age, &wmarks.metric, @@ -251,7 +260,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if ((int)action < 0) goto fail; - if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + if (pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > pattern.max_age_region) goto fail; if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || @@ -259,8 +270,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, "a, &wmarks); + scheme = damon_new_scheme(&pattern, action, "a, &wmarks); if (!scheme) goto fail; @@ -297,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - ret = damon_set_schemes(ctx, schemes, nr_schemes); - if (!ret) { - ret = count; - nr_schemes = 0; - } + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; unlock_out: mutex_unlock(&ctx->kdamond_lock); @@ -882,8 +890,10 @@ out: static int dbgfs_rm_context(char *name) { struct dentry *root, *dir, **new_dirs; + struct inode *inode; struct damon_ctx **new_ctxs; int i, j; + int ret = 0; if (damon_nr_running_ctxs()) return -EBUSY; @@ -896,16 +906,24 @@ static int dbgfs_rm_context(char *name) if (!dir) return -ENOENT; + inode = d_inode(dir); + if (!S_ISDIR(inode->i_mode)) { + ret = -EINVAL; + goto out_dput; + } + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; + if (!new_dirs) { + ret = -ENOMEM; + goto out_dput; + } new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), GFP_KERNEL); if (!new_ctxs) { - kfree(new_dirs); - return -ENOMEM; + ret = -ENOMEM; + goto out_new_dirs; } for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { @@ -925,7 +943,13 @@ static int dbgfs_rm_context(char *name) dbgfs_ctxs = new_ctxs; dbgfs_nr_ctxs--; - return 0; + goto out_dput; + +out_new_dirs: + kfree(new_dirs); +out_dput: + dput(dir); + return ret; } static ssize_t dbgfs_rm_context_write(struct file *file, @@ -1044,7 +1068,7 @@ static int __init __damon_dbgfs_init(void) fops[i]); dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); if (!dbgfs_dirs) { debugfs_remove(dbgfs_root); return -ENOMEM; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9de6f00a71c5..7b8fce2f67a8 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,10 +8,10 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include <linux/damon.h> -#include <linux/ioport.h> +#include <linux/kstrtox.h> #include <linux/module.h> -#include <linux/sched.h> -#include <linux/workqueue.h> + +#include "modules-common.h" #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX @@ -63,109 +63,35 @@ module_param(hot_thres_access_freq, ulong, 0600); static unsigned long cold_min_age __read_mostly = 120000000; module_param(cold_min_age, ulong, 0600); -/* - * Limit of time for trying the LRU lists sorting in milliseconds. - * - * DAMON_LRU_SORT tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used - * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the - * limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * The time quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms). That is, - * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms - * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); - -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically - * checks the watermarks. 200 (20%) by default. - */ -static unsigned long wmarks_high __read_mostly = 200; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring - * and the LRU-lists sorting. 150 (15%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 150; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks - * the watermarks. 50 (5%) by default. - */ -static unsigned long wmarks_low __read_mostly = 50; -module_param(wmarks_low, ulong, 0600); - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the hot/cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 5 ms by default. - */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the hot/cold memory monitoring. - * Please refer to the DAMON documentation for more detail. 100 ms by default. - */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); + +static struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); + +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); /* * Start of the target memory region in physical address. @@ -194,222 +120,97 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; -module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; -module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Number of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_hot_regions __read_mostly; -module_param(nr_lru_sorted_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_hot_regions __read_mostly; -module_param(bytes_lru_sorted_hot_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for hot regions have exceeded - */ -static unsigned long nr_hot_quota_exceeds __read_mostly; -module_param(nr_hot_quota_exceeds, ulong, 0400); - -/* - * Number of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; -module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; -module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Number of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_cold_regions __read_mostly; -module_param(nr_lru_sorted_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_cold_regions __read_mostly; -module_param(bytes_lru_sorted_cold_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for cold regions have exceeded - */ -static unsigned long nr_cold_quota_exceeds __read_mostly; -module_param(nr_cold_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); + +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); + +static struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_lru_sort_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) { - struct damon_lru_sort_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} + struct damos_quota quota = damon_lru_sort_quota; -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_lru_sort_ram_walk_arg arg = {}; + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &damon_lru_sort_wmarks); } /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of hot pages for more than half - * of quota_ms milliseconds within quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark hotter regions accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 1, - .weight_age = 0, - }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and accessed for more than the threshold */ - hot_thres, UINT_MAX, - /* no matter its age */ - 0, UINT_MAX, - /* prioritize those on LRU lists, as soon as found */ - DAMOS_LRU_PRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &wmarks); + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; - return scheme; + pattern.min_nr_accesses = hot_thres; + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of cold pages for more than - * half of quota_ms milliseconds within - * quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark colder regions not accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1, - }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for cold_thres or more micro-seconds, and */ - cold_thres, UINT_MAX, - /* mark those as not accessed, as soon as found */ - DAMOS_LRU_DEPRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &wmarks); + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; - return scheme; + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme, *next_scheme; - struct damon_addr_range addr_range; + struct damos *scheme; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) return err; - /* free previously set schemes */ - damon_for_each_scheme_safe(scheme, next_scheme, ctx) - damon_destroy_scheme(scheme); - /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / - 1000; + hot_thres = damon_lru_sort_mon_attrs.aggr_interval / + damon_lru_sort_mon_attrs.sample_interval * + hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - damon_add_scheme(ctx, scheme); + damon_set_schemes(ctx, &scheme, 1); - cold_thres = cold_min_age / aggr_interval; + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!scheme) return -ENOMEM; damon_add_scheme(ctx, scheme); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_lru_sort_turn(bool on) @@ -434,38 +235,31 @@ static int damon_lru_sort_turn(bool on) return 0; } -static struct delayed_work damon_lru_sort_timer; -static void damon_lru_sort_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_lru_sort_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); - -static bool damon_lru_sort_initialized; - static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; - if (rc < 0) - return rc; + err = kstrtobool(val, &enable); + if (err) + return err; + + if (is_enabled == enable) + return 0; - if (!damon_lru_sort_initialized) - return rc; + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; - schedule_delayed_work(&damon_lru_sort_timer, 0); + err = damon_lru_sort_turn(enable); + if (err) + return err; - return 0; +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -495,19 +289,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c) /* update the stats parameter */ damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) { - nr_lru_sort_tried_hot_regions = s->stat.nr_tried; - bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; - nr_lru_sorted_hot_regions = s->stat.nr_applied; - bytes_lru_sorted_hot_regions = s->stat.sz_applied; - nr_hot_quota_exceeds = s->stat.qt_exceeds; - } else if (s->action == DAMOS_LRU_DEPRIO) { - nr_lru_sort_tried_cold_regions = s->stat.nr_tried; - bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; - nr_lru_sorted_cold_regions = s->stat.nr_applied; - bytes_lru_sorted_cold_regions = s->stat.sz_applied; - nr_cold_quota_exceeds = s->stat.qt_exceeds; - } + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; } return damon_lru_sort_handle_commit_inputs(); @@ -520,29 +305,19 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) static int __init damon_lru_sort_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_lru_sort_turn(true); - schedule_delayed_work(&damon_lru_sort_timer, 0); - - damon_lru_sort_initialized = true; - return 0; + return err; } module_init(damon_lru_sort_init); diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 000000000000..b2381a8466ec --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#include <linux/damon.h> + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. + * @ctxp: Pointer to save the point to the newly created context + * @targetp: Pointer to save the point to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 000000000000..f49cdb417005 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park <sj@kernel.org> + */ + +#include <linux/moduleparam.h> + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index b1335de200e7..75409601f934 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -88,7 +88,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #define DAMON_MAX_SUBSCORE (100) #define DAMON_MAX_AGE_IN_LOG (32) -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { unsigned int max_nr_accesses; @@ -99,10 +99,10 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->aggr_interval / c->sample_interval; + max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; age_in_log++, age_in_sec >>= 1) ; @@ -127,48 +127,14 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, */ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - /* Return coldness of the region */ - return DAMOS_MAX_SCORE - hotness; + return hotness; } -int damon_hot_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { - unsigned int max_nr_accesses; - int freq_subscore; - unsigned int age_in_sec; - int age_in_log, age_subscore; - unsigned int freq_weight = s->quota.weight_nr_accesses; - unsigned int age_weight = s->quota.weight_age; - int hotness; - - max_nr_accesses = c->aggr_interval / c->sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; + int hotness = damon_hot_score(c, r, s); - /* If frequency is 0, higher age means it's colder */ - if (freq_subscore == 0) - age_in_log *= -1; - - /* - * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. - * Scale it to be in [0, 100] and set it as age subscore. - */ - age_in_log += DAMON_MAX_AGE_IN_LOG; - age_subscore = age_in_log * DAMON_MAX_SUBSCORE / - DAMON_MAX_AGE_IN_LOG / 2; - - hotness = (freq_weight * freq_subscore + age_weight * age_subscore); - if (freq_weight + age_weight) - hotness /= freq_weight + age_weight; - /* - * Transform it to fit in [0, DAMOS_MAX_SCORE] - */ - hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - - return hotness; + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; } diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 52329ff361cd..8d82d3722204 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dc131c6a5403..e1a4315c4be6 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -63,8 +63,7 @@ out: folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -78,7 +77,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(ctx, r); + __damon_pa_prepare_access_check(r); } } @@ -166,8 +165,7 @@ out: return result.accessed; } -static void __damon_pa_check_access(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; @@ -196,7 +194,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(ctx, r); + __damon_pa_check_access(r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } @@ -233,7 +231,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, bool mark_accessed) { unsigned long addr, applied = 0; @@ -242,27 +241,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) if (!page) continue; - mark_page_accessed(page); + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); put_page(page); applied++; } return applied * PAGE_SIZE; } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r) { - unsigned long addr, applied = 0; - - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); + return damon_pa_mark_accessed_or_deactivate(r, true); +} - if (!page) - continue; - deactivate_page(page); - put_page(page); - applied++; - } - return applied * PAGE_SIZE; +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + return damon_pa_mark_accessed_or_deactivate(r, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, @@ -276,7 +272,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_mark_accessed(r); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r); + case DAMOS_STAT: + break; default: + /* DAMOS actions that not yet supported by 'paddr'. */ break; } return 0; @@ -288,11 +287,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a7faf51b4bd4..e82631f39481 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,10 +8,10 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include <linux/damon.h> -#include <linux/ioport.h> +#include <linux/kstrtox.h> #include <linux/module.h> -#include <linux/sched.h> -#include <linux/workqueue.h> + +#include "modules-common.h" #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX @@ -50,124 +50,35 @@ module_param(commit_inputs, bool, 0600); static unsigned long min_age __read_mostly = 120000000; module_param(min_age, ulong, 0600); -/* - * Limit of time for trying the reclamation in milliseconds. - * - * DAMON_RECLAIM tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be - * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, - * the limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * Limit of size of memory for the reclamation in bytes. - * - * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a - * time window (quota_reset_interval_ms) and makes no more than this limit is - * tried. This can be used for limiting consumption of CPU and IO. If this - * value is zero, the limit is disabled. - * - * 128 MiB by default. - */ -static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; -module_param(quota_sz, ulong, 0600); - -/* - * The time/size quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms) and size - * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than - * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms - * milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); - -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically - * checks the watermarks. 500 (50%) by default. - */ -static unsigned long wmarks_high __read_mostly = 500; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring - * and the reclaiming. 400 (40%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 400; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks - * the watermarks. In the case, the system falls back to the LRU-based page - * granularity reclamation logic. 200 (20%) by default. - */ -static unsigned long wmarks_low __read_mostly = 200; -module_param(wmarks_low, ulong, 0600); - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the cold memory monitoring. Please refer - * to the DAMON documentation for more detail. 5 ms by default. - */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 100 ms by default. - */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +static struct damos_quota damon_reclaim_quota = { + /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */ + .ms = 10, + .sz = 128 * 1024 * 1024, + .reset_interval = 1000, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 +}; +DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); + +static struct damos_watermarks damon_reclaim_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 500, /* 50 percent */ + .mid = 400, /* 40 percent */ + .low = 200, /* 20 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks); + +static struct damon_attrs damon_reclaim_mon_attrs = { + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); /* * Start of the target memory region in physical address. @@ -196,119 +107,44 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of memory regions that tried to be reclaimed. - */ -static unsigned long nr_reclaim_tried_regions __read_mostly; -module_param(nr_reclaim_tried_regions, ulong, 0400); - -/* - * Total bytes of memory regions that tried to be reclaimed. - */ -static unsigned long bytes_reclaim_tried_regions __read_mostly; -module_param(bytes_reclaim_tried_regions, ulong, 0400); - -/* - * Number of memory regions that successfully be reclaimed. - */ -static unsigned long nr_reclaimed_regions __read_mostly; -module_param(nr_reclaimed_regions, ulong, 0400); - -/* - * Total bytes of memory regions that successfully be reclaimed. - */ -static unsigned long bytes_reclaimed_regions __read_mostly; -module_param(bytes_reclaimed_regions, ulong, 0400); - -/* - * Number of times that the time/space quota limits have exceeded - */ -static unsigned long nr_quota_exceeds __read_mostly; -module_param(nr_quota_exceeds, ulong, 0400); +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_reclaim_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_reclaim_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_reclaim_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - static struct damos *damon_reclaim_new_scheme(void) { - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, + .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try reclamation for more than quota_ms milliseconds - * or quota_sz bytes within quota_reset_interval_ms. - */ - .ms = quota_ms, - .sz = quota_sz, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, page out older regions first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1 - }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for min_age or more micro-seconds, and */ - min_age / aggr_interval, UINT_MAX, + + return damon_new_scheme( + &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ - "a, + &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ - &wmarks); - - return scheme; + &damon_reclaim_wmarks); } static int damon_reclaim_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); if (err) return err; @@ -316,19 +152,11 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_reclaim_turn(bool on) @@ -353,38 +181,31 @@ static int damon_reclaim_turn(bool on) return 0; } -static struct delayed_work damon_reclaim_timer; -static void damon_reclaim_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_reclaim_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); - -static bool damon_reclaim_initialized; - static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; + + err = kstrtobool(val, &enable); + if (err) + return err; - if (rc < 0) - return rc; + if (is_enabled == enable) + return 0; - /* system_wq might not initialized yet */ - if (!damon_reclaim_initialized) - return rc; + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; - schedule_delayed_work(&damon_reclaim_timer, 0); - return 0; + err = damon_reclaim_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -413,13 +234,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) struct damos *s; /* update the stats parameter */ - damon_for_each_scheme(s, c) { - nr_reclaim_tried_regions = s->stat.nr_tried; - bytes_reclaim_tried_regions = s->stat.sz_tried; - nr_reclaimed_regions = s->stat.nr_applied; - bytes_reclaimed_regions = s->stat.sz_applied; - nr_quota_exceeds = s->stat.qt_exceeds; - } + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; return damon_reclaim_handle_commit_inputs(); } @@ -431,29 +247,19 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) static int __init damon_reclaim_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - - schedule_delayed_work(&damon_reclaim_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_reclaim_turn(true); - damon_reclaim_initialized = true; - return 0; + return err; } module_init(damon_reclaim_init); diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 000000000000..52bebf242f74 --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park <sj@kernel.org> + */ + +#include <linux/slab.h> + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 000000000000..604a6cbc3ede --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park <sj@kernel.org> + */ + +#include <linux/damon.h> +#include <linux/kobject.h> + +extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 000000000000..81fc4d27f4e4 --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1338 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park <sj@kernel.org> + */ + +#include <linux/slab.h> + +#include "sysfs-common.h" + +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; + struct list_head regions_list; + int nr_regions; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + +static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "lru_prio", + "lru_deprio", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put("as->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype, &scheme->kobj, + "watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + &damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; + return 0; + +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); + kobject_put(&scheme->tried_regions->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + &wmarks); +} + +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; + scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; +} + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. + */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(®ion->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(®ion->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(®ion->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7488e27c87c3..aeb0beb1da91 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -5,1069 +5,11 @@ * Copyright (c) 2022 SeongJae Park <sj@kernel.org> */ -#include <linux/damon.h> -#include <linux/kobject.h> #include <linux/pid.h> #include <linux/sched.h> #include <linux/slab.h> -static DEFINE_MUTEX(damon_sysfs_lock); - -/* - * unsigned long range directory - */ - -struct damon_sysfs_ul_range { - struct kobject kobj; - unsigned long min; - unsigned long max; -}; - -static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( - unsigned long min, - unsigned long max) -{ - struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), - GFP_KERNEL); - - if (!range) - return NULL; - range->kobj = (struct kobject){}; - range->min = min; - range->max = max; - - return range; -} - -static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->min); -} - -static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long min; - int err; - - err = kstrtoul(buf, 0, &min); - if (err) - return -EINVAL; - - range->min = min; - return count; -} - -static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->max); -} - -static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long max; - int err; - - err = kstrtoul(buf, 0, &max); - if (err) - return -EINVAL; - - range->max = max; - return count; -} - -static void damon_sysfs_ul_range_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); -} - -static struct kobj_attribute damon_sysfs_ul_range_min_attr = - __ATTR_RW_MODE(min, 0600); - -static struct kobj_attribute damon_sysfs_ul_range_max_attr = - __ATTR_RW_MODE(max, 0600); - -static struct attribute *damon_sysfs_ul_range_attrs[] = { - &damon_sysfs_ul_range_min_attr.attr, - &damon_sysfs_ul_range_max_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_ul_range); - -static struct kobj_type damon_sysfs_ul_range_ktype = { - .release = damon_sysfs_ul_range_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_ul_range_groups, -}; - -/* - * schemes/stats directory - */ - -struct damon_sysfs_stats { - struct kobject kobj; - unsigned long nr_tried; - unsigned long sz_tried; - unsigned long nr_applied; - unsigned long sz_applied; - unsigned long qt_exceeds; -}; - -static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); -} - -static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_tried); -} - -static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_tried); -} - -static ssize_t nr_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_applied); -} - -static ssize_t sz_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_applied); -} - -static ssize_t qt_exceeds_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); -} - -static void damon_sysfs_stats_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); -} - -static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = - __ATTR_RO_MODE(nr_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = - __ATTR_RO_MODE(sz_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = - __ATTR_RO_MODE(nr_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = - __ATTR_RO_MODE(sz_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = - __ATTR_RO_MODE(qt_exceeds, 0400); - -static struct attribute *damon_sysfs_stats_attrs[] = { - &damon_sysfs_stats_nr_tried_attr.attr, - &damon_sysfs_stats_sz_tried_attr.attr, - &damon_sysfs_stats_nr_applied_attr.attr, - &damon_sysfs_stats_sz_applied_attr.attr, - &damon_sysfs_stats_qt_exceeds_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_stats); - -static struct kobj_type damon_sysfs_stats_ktype = { - .release = damon_sysfs_stats_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_stats_groups, -}; - -/* - * watermarks directory - */ - -struct damon_sysfs_watermarks { - struct kobject kobj; - enum damos_wmark_metric metric; - unsigned long interval_us; - unsigned long high; - unsigned long mid; - unsigned long low; -}; - -static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( - enum damos_wmark_metric metric, unsigned long interval_us, - unsigned long high, unsigned long mid, unsigned long low) -{ - struct damon_sysfs_watermarks *watermarks = kmalloc( - sizeof(*watermarks), GFP_KERNEL); - - if (!watermarks) - return NULL; - watermarks->kobj = (struct kobject){}; - watermarks->metric = metric; - watermarks->interval_us = interval_us; - watermarks->high = high; - watermarks->mid = mid; - watermarks->low = low; - return watermarks; -} - -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", -}; - -static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); -} - -static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; - - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; - return count; - } - } - return -EINVAL; -} - -static ssize_t interval_us_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->interval_us); -} - -static ssize_t interval_us_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->interval_us); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t high_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->high); -} - -static ssize_t high_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->high); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t mid_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->mid); -} - -static ssize_t mid_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->mid); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t low_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->low); -} - -static ssize_t low_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->low); - - if (err) - return -EINVAL; - return count; -} - -static void damon_sysfs_watermarks_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); -} - -static struct kobj_attribute damon_sysfs_watermarks_metric_attr = - __ATTR_RW_MODE(metric, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = - __ATTR_RW_MODE(interval_us, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_high_attr = - __ATTR_RW_MODE(high, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_mid_attr = - __ATTR_RW_MODE(mid, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_low_attr = - __ATTR_RW_MODE(low, 0600); - -static struct attribute *damon_sysfs_watermarks_attrs[] = { - &damon_sysfs_watermarks_metric_attr.attr, - &damon_sysfs_watermarks_interval_us_attr.attr, - &damon_sysfs_watermarks_high_attr.attr, - &damon_sysfs_watermarks_mid_attr.attr, - &damon_sysfs_watermarks_low_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_watermarks); - -static struct kobj_type damon_sysfs_watermarks_ktype = { - .release = damon_sysfs_watermarks_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_watermarks_groups, -}; - -/* - * scheme/weights directory - */ - -struct damon_sysfs_weights { - struct kobject kobj; - unsigned int sz; - unsigned int nr_accesses; - unsigned int age; -}; - -static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, - unsigned int nr_accesses, unsigned int age) -{ - struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), - GFP_KERNEL); - - if (!weights) - return NULL; - weights->kobj = (struct kobject){}; - weights->sz = sz; - weights->nr_accesses = nr_accesses; - weights->age = age; - return weights; -} - -static ssize_t sz_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->sz); -} - -static ssize_t sz_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->sz); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t nr_accesses_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->nr_accesses); -} - -static ssize_t nr_accesses_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->nr_accesses); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t age_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->age); -} - -static ssize_t age_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->age); - - if (err) - return -EINVAL; - return count; -} - -static void damon_sysfs_weights_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); -} - -static struct kobj_attribute damon_sysfs_weights_sz_attr = - __ATTR_RW_MODE(sz_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = - __ATTR_RW_MODE(nr_accesses_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_age_attr = - __ATTR_RW_MODE(age_permil, 0600); - -static struct attribute *damon_sysfs_weights_attrs[] = { - &damon_sysfs_weights_sz_attr.attr, - &damon_sysfs_weights_nr_accesses_attr.attr, - &damon_sysfs_weights_age_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_weights); - -static struct kobj_type damon_sysfs_weights_ktype = { - .release = damon_sysfs_weights_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_weights_groups, -}; - -/* - * quotas directory - */ - -struct damon_sysfs_quotas { - struct kobject kobj; - struct damon_sysfs_weights *weights; - unsigned long ms; - unsigned long sz; - unsigned long reset_interval_ms; -}; - -static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); -} - -static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) -{ - struct damon_sysfs_weights *weights; - int err; - - weights = damon_sysfs_weights_alloc(0, 0, 0); - if (!weights) - return -ENOMEM; - - err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, - "as->kobj, "weights"); - if (err) - kobject_put(&weights->kobj); - else - quotas->weights = weights; - return err; -} - -static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) -{ - kobject_put("as->weights->kobj); -} - -static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->ms); -} - -static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->ms); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->sz); -} - -static ssize_t bytes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->sz); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t reset_interval_ms_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); -} - -static ssize_t reset_interval_ms_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->reset_interval_ms); - - if (err) - return -EINVAL; - return count; -} - -static void damon_sysfs_quotas_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); -} - -static struct kobj_attribute damon_sysfs_quotas_ms_attr = - __ATTR_RW_MODE(ms, 0600); - -static struct kobj_attribute damon_sysfs_quotas_sz_attr = - __ATTR_RW_MODE(bytes, 0600); - -static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = - __ATTR_RW_MODE(reset_interval_ms, 0600); - -static struct attribute *damon_sysfs_quotas_attrs[] = { - &damon_sysfs_quotas_ms_attr.attr, - &damon_sysfs_quotas_sz_attr.attr, - &damon_sysfs_quotas_reset_interval_ms_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_quotas); - -static struct kobj_type damon_sysfs_quotas_ktype = { - .release = damon_sysfs_quotas_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_quotas_groups, -}; - -/* - * access_pattern directory - */ - -struct damon_sysfs_access_pattern { - struct kobject kobj; - struct damon_sysfs_ul_range *sz; - struct damon_sysfs_ul_range *nr_accesses; - struct damon_sysfs_ul_range *age; -}; - -static -struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) -{ - struct damon_sysfs_access_pattern *access_pattern = - kmalloc(sizeof(*access_pattern), GFP_KERNEL); - - if (!access_pattern) - return NULL; - access_pattern->kobj = (struct kobject){}; - return access_pattern; -} - -static int damon_sysfs_access_pattern_add_range_dir( - struct damon_sysfs_access_pattern *access_pattern, - struct damon_sysfs_ul_range **range_dir_ptr, - char *name) -{ - struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); - int err; - - if (!range) - return -ENOMEM; - err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, - &access_pattern->kobj, name); - if (err) - kobject_put(&range->kobj); - else - *range_dir_ptr = range; - return err; -} - -static int damon_sysfs_access_pattern_add_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - int err; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->sz, "sz"); - if (err) - goto put_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->nr_accesses, "nr_accesses"); - if (err) - goto put_nr_accesses_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->age, "age"); - if (err) - goto put_age_nr_accesses_sz_out; - return 0; - -put_age_nr_accesses_sz_out: - kobject_put(&access_pattern->age->kobj); - access_pattern->age = NULL; -put_nr_accesses_sz_out: - kobject_put(&access_pattern->nr_accesses->kobj); - access_pattern->nr_accesses = NULL; -put_sz_out: - kobject_put(&access_pattern->sz->kobj); - access_pattern->sz = NULL; - return err; -} - -static void damon_sysfs_access_pattern_rm_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - kobject_put(&access_pattern->sz->kobj); - kobject_put(&access_pattern->nr_accesses->kobj); - kobject_put(&access_pattern->age->kobj); -} - -static void damon_sysfs_access_pattern_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); -} - -static struct attribute *damon_sysfs_access_pattern_attrs[] = { - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); - -static struct kobj_type damon_sysfs_access_pattern_ktype = { - .release = damon_sysfs_access_pattern_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_access_pattern_groups, -}; - -/* - * scheme directory - */ - -struct damon_sysfs_scheme { - struct kobject kobj; - enum damos_action action; - struct damon_sysfs_access_pattern *access_pattern; - struct damon_sysfs_quotas *quotas; - struct damon_sysfs_watermarks *watermarks; - struct damon_sysfs_stats *stats; -}; - -/* This should match with enum damos_action */ -static const char * const damon_sysfs_damos_action_strs[] = { - "willneed", - "cold", - "pageout", - "hugepage", - "nohugepage", - "lru_prio", - "lru_deprio", - "stat", -}; - -static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( - enum damos_action action) -{ - struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), - GFP_KERNEL); - - if (!scheme) - return NULL; - scheme->kobj = (struct kobject){}; - scheme->action = action; - return scheme; -} - -static int damon_sysfs_scheme_set_access_pattern( - struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_access_pattern *access_pattern; - int err; - - access_pattern = damon_sysfs_access_pattern_alloc(); - if (!access_pattern) - return -ENOMEM; - err = kobject_init_and_add(&access_pattern->kobj, - &damon_sysfs_access_pattern_ktype, &scheme->kobj, - "access_pattern"); - if (err) - goto out; - err = damon_sysfs_access_pattern_add_dirs(access_pattern); - if (err) - goto out; - scheme->access_pattern = access_pattern; - return 0; - -out: - kobject_put(&access_pattern->kobj); - return err; -} - -static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); - int err; - - if (!quotas) - return -ENOMEM; - err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, - &scheme->kobj, "quotas"); - if (err) - goto out; - err = damon_sysfs_quotas_add_dirs(quotas); - if (err) - goto out; - scheme->quotas = quotas; - return 0; - -out: - kobject_put("as->kobj); - return err; -} - -static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_watermarks *watermarks = - damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); - int err; - - if (!watermarks) - return -ENOMEM; - err = kobject_init_and_add(&watermarks->kobj, - &damon_sysfs_watermarks_ktype, &scheme->kobj, - "watermarks"); - if (err) - kobject_put(&watermarks->kobj); - else - scheme->watermarks = watermarks; - return err; -} - -static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); - int err; - - if (!stats) - return -ENOMEM; - err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, - &scheme->kobj, "stats"); - if (err) - kobject_put(&stats->kobj); - else - scheme->stats = stats; - return err; -} - -static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) -{ - int err; - - err = damon_sysfs_scheme_set_access_pattern(scheme); - if (err) - return err; - err = damon_sysfs_scheme_set_quotas(scheme); - if (err) - goto put_access_pattern_out; - err = damon_sysfs_scheme_set_watermarks(scheme); - if (err) - goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); - if (err) - goto put_watermarks_quotas_access_pattern_out; - return 0; - -put_watermarks_quotas_access_pattern_out: - kobject_put(&scheme->watermarks->kobj); - scheme->watermarks = NULL; -put_quotas_access_pattern_out: - kobject_put(&scheme->quotas->kobj); - scheme->quotas = NULL; -put_access_pattern_out: - kobject_put(&scheme->access_pattern->kobj); - scheme->access_pattern = NULL; - return err; -} - -static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) -{ - damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); - kobject_put(&scheme->access_pattern->kobj); - damon_sysfs_quotas_rm_dirs(scheme->quotas); - kobject_put(&scheme->quotas->kobj); - kobject_put(&scheme->watermarks->kobj); - kobject_put(&scheme->stats->kobj); -} - -static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); -} - -static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - enum damos_action action; - - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; - return count; - } - } - return -EINVAL; -} - -static void damon_sysfs_scheme_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); -} - -static struct kobj_attribute damon_sysfs_scheme_action_attr = - __ATTR_RW_MODE(action, 0600); - -static struct attribute *damon_sysfs_scheme_attrs[] = { - &damon_sysfs_scheme_action_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_scheme); - -static struct kobj_type damon_sysfs_scheme_ktype = { - .release = damon_sysfs_scheme_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_scheme_groups, -}; - -/* - * schemes directory - */ - -struct damon_sysfs_schemes { - struct kobject kobj; - struct damon_sysfs_scheme **schemes_arr; - int nr; -}; - -static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); -} - -static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) -{ - struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; - int i; - - for (i = 0; i < schemes->nr; i++) { - damon_sysfs_scheme_rm_dirs(schemes_arr[i]); - kobject_put(&schemes_arr[i]->kobj); - } - schemes->nr = 0; - kfree(schemes_arr); - schemes->schemes_arr = NULL; -} - -static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, - int nr_schemes) -{ - struct damon_sysfs_scheme **schemes_arr, *scheme; - int err, i; - - damon_sysfs_schemes_rm_dirs(schemes); - if (!nr_schemes) - return 0; - - schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), - GFP_KERNEL | __GFP_NOWARN); - if (!schemes_arr) - return -ENOMEM; - schemes->schemes_arr = schemes_arr; - - for (i = 0; i < nr_schemes; i++) { - scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); - if (!scheme) { - damon_sysfs_schemes_rm_dirs(schemes); - return -ENOMEM; - } - - err = kobject_init_and_add(&scheme->kobj, - &damon_sysfs_scheme_ktype, &schemes->kobj, - "%d", i); - if (err) - goto out; - err = damon_sysfs_scheme_add_dirs(scheme); - if (err) - goto out; - - schemes_arr[i] = scheme; - schemes->nr++; - } - return 0; - -out: - damon_sysfs_schemes_rm_dirs(schemes); - kobject_put(&scheme->kobj); - return err; -} - -static ssize_t nr_schemes_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); - - return sysfs_emit(buf, "%d\n", schemes->nr); -} - -static ssize_t nr_schemes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); - int nr, err = kstrtoint(buf, 0, &nr); - - if (err) - return err; - if (nr < 0) - return -EINVAL; - - if (!mutex_trylock(&damon_sysfs_lock)) - return -EBUSY; - err = damon_sysfs_schemes_add_dirs(schemes, nr); - mutex_unlock(&damon_sysfs_lock); - if (err) - return err; - return count; -} - -static void damon_sysfs_schemes_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); -} - -static struct kobj_attribute damon_sysfs_schemes_nr_attr = - __ATTR_RW_MODE(nr_schemes, 0600); - -static struct attribute *damon_sysfs_schemes_attrs[] = { - &damon_sysfs_schemes_nr_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_schemes); - -static struct kobj_type damon_sysfs_schemes_ktype = { - .release = damon_sysfs_schemes_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_schemes_groups, -}; +#include "sysfs-common.h" /* * init region directory @@ -1075,23 +17,12 @@ static struct kobj_type damon_sysfs_schemes_ktype = { struct damon_sysfs_region { struct kobject kobj; - unsigned long start; - unsigned long end; + struct damon_addr_range ar; }; -static struct damon_sysfs_region *damon_sysfs_region_alloc( - unsigned long start, - unsigned long end) +static struct damon_sysfs_region *damon_sysfs_region_alloc(void) { - struct damon_sysfs_region *region = kmalloc(sizeof(*region), - GFP_KERNEL); - - if (!region) - return NULL; - region->kobj = (struct kobject){}; - region->start = start; - region->end = end; - return region; + return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL); } static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1100,7 +31,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->start); + return sysfs_emit(buf, "%lu\n", region->ar.start); } static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1108,11 +39,9 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->start); + int err = kstrtoul(buf, 0, ®ion->ar.start); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1121,7 +50,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->end); + return sysfs_emit(buf, "%lu\n", region->ar.end); } static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1129,11 +58,9 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->end); + int err = kstrtoul(buf, 0, ®ion->ar.end); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_region_release(struct kobject *kobj) @@ -1204,7 +131,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, regions->regions_arr = regions_arr; for (i = 0; i < nr_regions; i++) { - region = damon_sysfs_region_alloc(0, 0); + region = damon_sysfs_region_alloc(); if (!region) { damon_sysfs_regions_rm_dirs(regions); return -ENOMEM; @@ -1237,8 +164,7 @@ static ssize_t nr_regions_show(struct kobject *kobj, static ssize_t nr_regions_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_regions *regions = container_of(kobj, - struct damon_sysfs_regions, kobj); + struct damon_sysfs_regions *regions; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1246,6 +172,8 @@ static ssize_t nr_regions_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_regions_add_dirs(regions, nr); @@ -1440,8 +368,7 @@ static ssize_t nr_targets_show(struct kobject *kobj, static ssize_t nr_targets_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_targets *targets = container_of(kobj, - struct damon_sysfs_targets, kobj); + struct damon_sysfs_targets *targets; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1449,6 +376,8 @@ static ssize_t nr_targets_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_targets_add_dirs(targets, nr); @@ -1525,7 +454,7 @@ static ssize_t sample_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->sample_us = us; return count; @@ -1549,7 +478,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->aggr_us = us; return count; @@ -1573,7 +502,7 @@ static ssize_t update_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->update_us = us; return count; @@ -1962,8 +891,7 @@ static ssize_t nr_contexts_show(struct kobject *kobj, static ssize_t nr_contexts_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_contexts *contexts = container_of(kobj, - struct damon_sysfs_contexts, kobj); + struct damon_sysfs_contexts *contexts; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -1973,6 +901,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj, if (nr < 0 || 1 < nr) return -EINVAL; + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_contexts_add_dirs(contexts, nr); @@ -2071,6 +1000,16 @@ enum damon_sysfs_cmd { */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, + /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ NR_DAMON_SYSFS_CMDS, @@ -2082,6 +1021,8 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_regions", + "clear_schemes_tried_regions", }; /* @@ -2127,18 +1068,23 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; - - return damon_set_attrs(ctx, sys_intervals->sample_us, - sys_intervals->aggr_us, sys_intervals->update_us, - sys_nr_regions->min, sys_nr_regions->max); + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); } static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next; + bool has_pid = damon_target_has_pid(ctx); damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) + if (has_pid) put_pid(t->pid); damon_destroy_target(t); } @@ -2157,11 +1103,11 @@ static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_region *sys_region = sysfs_regions->regions_arr[i]; - if (sys_region->start > sys_region->end) + if (sys_region->ar.start > sys_region->ar.end) goto out; - ranges[i].start = sys_region->start; - ranges[i].end = sys_region->end; + ranges[i].start = sys_region->ar.start; + ranges[i].end = sys_region->ar.end; if (i == 0) continue; if (ranges[i - 1].end > ranges[i].start) @@ -2182,12 +1128,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; + damon_add_target(ctx, t); if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; } - damon_add_target(ctx, t); err = damon_sysfs_set_regions(t, sys_target->regions); if (err) goto destroy_targets_out; @@ -2256,60 +1202,21 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } -static struct damos *damon_sysfs_mk_scheme( - struct damon_sysfs_scheme *sysfs_scheme) -{ - struct damon_sysfs_access_pattern *pattern = - sysfs_scheme->access_pattern; - struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; - struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; - struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - struct damos_quota quota = { - .ms = sysfs_quotas->ms, - .sz = sysfs_quotas->sz, - .reset_interval = sysfs_quotas->reset_interval_ms, - .weight_sz = sysfs_weights->sz, - .weight_nr_accesses = sysfs_weights->nr_accesses, - .weight_age = sysfs_weights->age, - }; - struct damos_watermarks wmarks = { - .metric = sysfs_wmarks->metric, - .interval = sysfs_wmarks->interval_us, - .high = sysfs_wmarks->high, - .mid = sysfs_wmarks->mid, - .low = sysfs_wmarks->low, - }; - - return damon_new_scheme(pattern->sz->min, pattern->sz->max, - pattern->nr_accesses->min, pattern->nr_accesses->max, - pattern->age->min, pattern->age->max, - sysfs_scheme->action, "a, &wmarks); -} - -static int damon_sysfs_set_schemes(struct damon_ctx *ctx, - struct damon_sysfs_schemes *sysfs_schemes) -{ - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - struct damos *scheme, *next; - - scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); - if (!scheme) { - damon_for_each_scheme_safe(scheme, next, ctx) - damon_destroy_scheme(scheme); - return -ENOMEM; - } - damon_add_scheme(ctx, scheme); - } - return 0; -} - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; + struct damon_sysfs_kdamond *kdamond; - if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) + /* damon_sysfs_schemes_update_regions_stop() might not yet called */ + kdamond = damon_sysfs_cmd_request.kdamond; + if (kdamond && damon_sysfs_cmd_request.cmd == + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && + ctx == kdamond->damon_ctx) { + damon_sysfs_schemes_update_regions_stop(ctx); + mutex_unlock(&damon_sysfs_lock); + } + + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); @@ -2332,26 +1239,46 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_schemes *sysfs_schemes; - struct damos *scheme; - int schemes_idx = 0; if (!ctx) return -EINVAL; - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; - damon_for_each_scheme(scheme, ctx) { - struct damon_sysfs_stats *sysfs_stats; - - sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; - sysfs_stats->nr_tried = scheme->stat.nr_tried; - sysfs_stats->sz_tried = scheme->stat.sz_tried; - sysfs_stats->nr_applied = scheme->stat.nr_applied; - sysfs_stats->sz_applied = scheme->stat.sz_applied; - sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; - } + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -2404,10 +1331,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!mutex_trylock(&damon_sysfs_lock)) + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) @@ -2419,13 +1348,30 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; default: break; } /* Mark the request as invalid now. */ damon_sysfs_cmd_request.kdamond = NULL; out: - mutex_unlock(&damon_sysfs_lock); + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: return err; } @@ -2455,8 +1401,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) struct damon_ctx *ctx; int err; - if (kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx)) + if (damon_sysfs_kdamond_running(kdamond)) return -EBUSY; if (damon_sysfs_cmd_request.kdamond == kdamond) return -EBUSY; @@ -2579,19 +1524,16 @@ static ssize_t pid_show(struct kobject *kobj, struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); struct damon_ctx *ctx; - int pid; + int pid = -1; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; ctx = kdamond->damon_ctx; - if (!ctx) { - pid = -1; + if (!ctx) goto out; - } + mutex_lock(&ctx->kdamond_lock); - if (!ctx->kdamond) - pid = -1; - else + if (ctx->kdamond) pid = ctx->kdamond->pid; mutex_unlock(&ctx->kdamond_lock); out: @@ -2657,23 +1599,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) kdamonds->kdamonds_arr = NULL; } -static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, int nr_kdamonds) { - int nr_running_ctxs = 0; int i; for (i = 0; i < nr_kdamonds; i++) { - struct damon_ctx *ctx = kdamonds[i]->damon_ctx; - - if (!ctx) - continue; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - nr_running_ctxs++; - mutex_unlock(&ctx->kdamond_lock); + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; } - return nr_running_ctxs; + + return false; } static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, @@ -2682,15 +1619,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; int err, i; - if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; - for (i = 0; i < kdamonds->nr; i++) { - if (damon_sysfs_cmd_request.kdamond == - kdamonds->kdamonds_arr[i]) - return -EBUSY; - } - damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; @@ -2741,8 +1672,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj, static ssize_t nr_kdamonds_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, - struct damon_sysfs_kdamonds, kobj); + struct damon_sysfs_kdamonds *kdamonds; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -2751,6 +1681,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index d4f55f349100..bce37c487540 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,33 +14,19 @@ #include <kunit/test.h> -static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, + ssize_t nr_vmas) { - int i, j; - unsigned long largest_gap, gap; + int i; + MA_STATE(mas, mt, 0, 0); if (!nr_vmas) return; - for (i = 0; i < nr_vmas - 1; i++) { - vmas[i].vm_next = &vmas[i + 1]; - - vmas[i].vm_rb.rb_left = NULL; - vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; - - largest_gap = 0; - for (j = i; j < nr_vmas; j++) { - if (j == 0) - continue; - gap = vmas[j].vm_start - vmas[j - 1].vm_end; - if (gap > largest_gap) - largest_gap = gap; - } - vmas[i].rb_subtree_gap = largest_gap; - } - vmas[i].vm_next = NULL; - vmas[i].vm_rb.rb_right = NULL; - vmas[i].rb_subtree_gap = 0; + mas_lock(&mas); + for (i = 0; i < nr_vmas; i++) + vma_mas_store(&vmas[i], &mas); + mas_unlock(&mas); } /* @@ -72,6 +58,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) */ static void damon_test_three_regions_in_vmas(struct kunit *test) { + static struct mm_struct mm; struct damon_addr_range regions[3] = {0,}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ struct vm_area_struct vmas[] = { @@ -83,9 +70,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - __link_vmas(vmas, 6); + mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); - __damon_va_three_regions(&vmas[0], regions); + __damon_va_three_regions(&mm, regions); KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95..15f03df66db6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, return -EINVAL; orig_end = r->ar.end; - sz_orig = r->ar.end - r->ar.start; + sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) @@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r) * * Returns 0 if success, or negative error code otherwise. */ -static int __damon_va_three_regions(struct vm_area_struct *vma, +static int __damon_va_three_regions(struct mm_struct *mm, struct damon_addr_range regions[3]) { - struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; - struct vm_area_struct *last_vma = NULL; - unsigned long start = 0; - struct rb_root rbroot; - - /* Find two biggest gaps so that first_gap > second_gap > others */ - for (; vma; vma = vma->vm_next) { - if (!last_vma) { - start = vma->vm_start; - goto next; - } + struct damon_addr_range first_gap = {0}, second_gap = {0}; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma, *prev = NULL; + unsigned long start; - if (vma->rb_subtree_gap <= sz_range(&second_gap)) { - rbroot.rb_node = &vma->vm_rb; - vma = rb_entry(rb_last(&rbroot), - struct vm_area_struct, vm_rb); + /* + * Find the two biggest gaps so that first_gap > second_gap > others. + * If this is too slow, it can be optimised to examine the maple + * tree gaps. + */ + for_each_vma(vmi, vma) { + unsigned long gap; + + if (!prev) { + start = vma->vm_start; goto next; } - - gap.start = last_vma->vm_end; - gap.end = vma->vm_start; - if (sz_range(&gap) > sz_range(&second_gap)) { - swap(gap, second_gap); - if (sz_range(&second_gap) > sz_range(&first_gap)) - swap(second_gap, first_gap); + gap = vma->vm_start - prev->vm_end; + + if (gap > sz_range(&first_gap)) { + second_gap = first_gap; + first_gap.start = prev->vm_end; + first_gap.end = vma->vm_start; + } else if (gap > sz_range(&second_gap)) { + second_gap.start = prev->vm_end; + second_gap.end = vma->vm_start; } next: - last_vma = vma; + prev = vma; } if (!sz_range(&second_gap) || !sz_range(&first_gap)) @@ -159,7 +160,7 @@ next: regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); - regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); return 0; } @@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t, return -EINVAL; mmap_read_lock(mm); - rc = __damon_va_three_regions(mm->mmap, regions); + rc = __damon_va_three_regions(mm, regions); mmap_read_unlock(mm); mmput(mm); @@ -250,8 +251,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; @@ -302,9 +303,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, pte_t *pte; spinlock_t *ptl; - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); - if (pmd_huge(*pmd)) { + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); return 0; @@ -391,8 +397,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void __damon_va_prepare_access_check(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -410,7 +416,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(mm, r); mmput(mm); } } @@ -429,9 +435,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); - if (!pmd_huge(*pmd)) { + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!pmd_trans_huge(*pmd)) { spin_unlock(ptl); goto regular_page; } @@ -532,16 +543,15 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void __damon_va_check_access(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_check_access(struct mm_struct *mm, + struct damon_region *r, bool same_target) { - static struct mm_struct *last_mm; static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == ALIGN_DOWN(r->sampling_addr, last_page_sz))) { if (last_accessed) r->nr_accesses++; @@ -552,7 +562,6 @@ static void __damon_va_check_access(struct damon_ctx *ctx, if (last_accessed) r->nr_accesses++; - last_mm = mm; last_addr = r->sampling_addr; } @@ -562,14 +571,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; + bool same_target; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; + same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(ctx, mm, r); + __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; } mmput(mm); } @@ -581,9 +593,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -static bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(struct damon_target *t) { - struct damon_target *t = target; struct task_struct *task; task = damon_get_task_struct(t); @@ -607,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target, { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); - unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); unsigned long applied; mm = damon_get_mm(target); @@ -646,6 +657,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_STAT: return 0; default: + /* + * DAMOS actions that are not yet supported by 'vaddr'. + */ return 0; } @@ -659,7 +673,7 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/debug.c b/mm/debug.c index bef329bf28f0..7f8e5f744e42 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,9 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - folio_entire_mapcount(folio), + head_compound_mapcount(head), + head_subpages_mapcount(head), head_compound_pincount(head)); } @@ -139,13 +140,11 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %px start %px end %px\n" - "next %px prev %px mm %px\n" + pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", - vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, - vma->vm_prev, vma->vm_mm, + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, @@ -155,11 +154,11 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" + pr_emerg("mm %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif - "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" @@ -183,11 +182,11 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, + mm, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif - mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index dc7df1254f0a..c631ade3f1d2 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -38,11 +38,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - */ - -#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) - -/* + * * On s390 platform, the lower 4 bits are used to identify given page table * entry type. But these bits might affect the ability to clear entries with * pxx_clear() because of how dynamic page table folding works on s390. So @@ -175,18 +171,6 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } -static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) -{ - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - pr_debug("Validating PTE saved write\n"); - WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); - WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -306,22 +290,6 @@ static void __init pmd_leaf_tests(struct pgtable_debug_args *args) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) -{ - pmd_t pmd; - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - if (!has_transparent_hugepage()) - return; - - pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); - WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); - WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); -} - #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -455,7 +423,6 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -1125,7 +1092,7 @@ static int __init init_args(struct pgtable_debug_args *args) */ memset(args, 0, sizeof(*args)); args->vaddr = get_random_vaddr(); - args->page_prot = vm_get_page_prot(VMFLAGS); + args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS); args->page_prot_none = vm_get_page_prot(VM_NONE); args->is_contiguous_page = false; args->pud_pfn = ULONG_MAX; @@ -1292,9 +1259,6 @@ static int __init debug_vm_pgtable(void) pmd_leaf_tests(&args); pud_leaf_tests(&args); - pte_savedwrite_tests(&args); - pmd_savedwrite_tests(&args); - pte_special_tests(&args); pte_protnone_tests(&args); pmd_protnone_tests(&args); diff --git a/mm/fadvise.c b/mm/fadvise.c index c76ee665355a..bf04fec87f35 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -72,7 +72,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) - endbyte = -1; + endbyte = LLONG_MAX; else endbyte--; /* inclusive */ diff --git a/mm/failslab.c b/mm/failslab.c index 58df9789f1d2..ffc420c0e767 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -16,6 +16,8 @@ static struct { bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + int flags = 0; + /* No fault-injection for bootstrap cache */ if (unlikely(s == kmem_cache)) return false; @@ -30,10 +32,16 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; + /* + * In some cases, it expects to specify __GFP_NOWARN + * to avoid printing any information(not just a warning), + * thus avoiding deadlocks. See commit 6b9dbedbe349 for + * details. + */ if (gfpflags & __GFP_NOWARN) - failslab.attr.no_warn = true; + flags |= FAULT_NOWARN; - return should_fail(&failslab.attr, s->object_size); + return should_fail_ex(&failslab.attr, s->object_size, flags); } static int __init setup_failslab(char *str) diff --git a/mm/filemap.c b/mm/filemap.c index 15800334147b..c4d4ace9cc70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -506,9 +506,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct pagevec pvec; int nr_pages; - if (end_byte < start_byte) - return; - pagevec_init(&pvec); while (index <= end) { unsigned i; @@ -632,22 +629,23 @@ bool filemap_range_has_writeback(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; + struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || folio_test_locked(folio) || + folio_test_writeback(folio)) break; } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); @@ -669,6 +667,9 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0, err2; + if (lend < lstart) + return 0; + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -769,6 +770,9 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; + if (lend < lstart) + return 0; + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -784,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) EXPORT_SYMBOL(file_write_and_wait_range); /** - * replace_page_cache_page - replace a pagecache page with a new one - * @old: page to be replaced - * @new: page to replace with - * - * This function replaces a page in the pagecache with a new one. On - * success it acquires the pagecache reference for the new page and - * drops it for the old page. Both the old and new pages must be - * locked. This function does not add the new page to the LRU, the + * replace_page_cache_folio - replace a pagecache folio with a new one + * @old: folio to be replaced + * @new: folio to replace with + * + * This function replaces a folio in the pagecache with a new one. On + * success it acquires the pagecache reference for the new folio and + * drops it for the old folio. Both the old and new folios must be + * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ -void replace_page_cache_page(struct page *old, struct page *new) +void replace_page_cache_folio(struct folio *old, struct folio *new) { - struct folio *fold = page_folio(old); - struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - VM_BUG_ON_PAGE(!PageLocked(old), old); - VM_BUG_ON_PAGE(!PageLocked(new), new); - VM_BUG_ON_PAGE(new->mapping, new); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(new->mapping, new); - get_page(new); + folio_get(new); new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(fold, fnew); + mem_cgroup_migrate(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(old)) - __dec_lruvec_page_state(old, NR_FILE_PAGES); - if (!PageHuge(new)) - __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) - __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) - __inc_lruvec_page_state(new, NR_SHMEM); + if (!folio_test_hugetlb(old)) + __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + if (!folio_test_hugetlb(new)) + __lruvec_stat_add_folio(new, NR_FILE_PAGES); + if (folio_test_swapbacked(old)) + __lruvec_stat_sub_folio(old, NR_SHMEM); + if (folio_test_swapbacked(new)) + __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) - free_folio(fold); - folio_put(fold); + free_folio(old); + folio_put(old); } -EXPORT_SYMBOL_GPL(replace_page_cache_page); +EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) @@ -1221,15 +1223,12 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - bool delayacct = false; unsigned long pflags; + bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { - if (!folio_test_swapbacked(folio)) { - delayacct_thrashing_start(); - delayacct = true; - } + delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } @@ -1329,8 +1328,7 @@ repeat: finish_wait(q, wait); if (thrashing) { - if (delayacct) - delayacct_thrashing_end(); + delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } @@ -1378,17 +1376,14 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; - bool delayacct = false; unsigned long pflags; + bool in_thrashing; wait_queue_head_t *q; struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { - if (!folio_test_swapbacked(folio)) { - delayacct_thrashing_start(); - delayacct = true; - } + delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } @@ -1435,8 +1430,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, finish_wait(q, wait); if (thrashing) { - if (delayacct) - delayacct_thrashing_end(); + delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } @@ -1467,7 +1461,7 @@ EXPORT_SYMBOL(folio_wait_bit_killable); * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int folio_put_wait_locked(struct folio *folio, int state) +static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } @@ -1633,24 +1627,26 @@ EXPORT_SYMBOL(folio_end_writeback); */ void page_endio(struct page *page, bool is_write, int err) { + struct folio *folio = page_folio(page); + if (!is_write) { if (!err) { - SetPageUptodate(page); + folio_mark_uptodate(folio); } else { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(folio); + folio_set_error(folio); } - unlock_page(page); + folio_unlock(folio); } else { if (err) { struct address_space *mapping; - SetPageError(page); - mapping = page_mapping(page); + folio_set_error(folio); + mapping = folio_mapping(folio); if (mapping) mapping_set_error(mapping, err); } - end_page_writeback(page); + folio_end_writeback(folio); } } EXPORT_SYMBOL_GPL(page_endio); @@ -2053,10 +2049,10 @@ reset: * * Return: The number of entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, +unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { - XA_STATE(xas, &mapping->i_pages, start); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); @@ -2067,6 +2063,15 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, } rcu_read_unlock(); + if (folio_batch_count(fbatch)) { + unsigned long nr = 1; + int idx = folio_batch_count(fbatch) - 1; + + folio = fbatch->folios[idx]; + if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + nr = folio_nr_pages(folio); + *start = indices[idx] + nr; + } return folio_batch_count(fbatch); } @@ -2090,16 +2095,16 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * * Return: The number of entries which were found. */ -unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, +unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { - XA_STATE(xas, &mapping->i_pages, start); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { if (!xa_is_value(folio)) { - if (folio->index < start) + if (folio->index < *start) goto put; if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; @@ -2122,6 +2127,15 @@ put: } rcu_read_unlock(); + if (folio_batch_count(fbatch)) { + unsigned long nr = 1; + int idx = folio_batch_count(fbatch) - 1; + + folio = fbatch->folios[idx]; + if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + nr = folio_nr_pages(folio); + *start = indices[idx] + nr; + } return folio_batch_count(fbatch); } @@ -2195,30 +2209,31 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) } /** - * find_get_pages_contig - gang contiguous pagecache lookup + * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search - * @index: The starting page index - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed + * @start: The starting page index + * @end: The final page index (inclusive) + * @fbatch: The batch to fill * - * find_get_pages_contig() works exactly like find_get_pages_range(), - * except that the returned number of pages are guaranteed to be - * contiguous. + * filemap_get_folios_contig() works exactly like filemap_get_folios(), + * except the returned folios are guaranteed to be contiguous. This may + * not return all contiguous folios if the batch gets filled up. * - * Return: the number of pages which were found. + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. */ -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, - unsigned int nr_pages, struct page **pages) + +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, index); + XA_STATE(xas, &mapping->i_pages, *start); + unsigned long nr; struct folio *folio; - unsigned int ret = 0; - - if (unlikely(!nr_pages)) - return 0; rcu_read_lock(); - for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + + for (folio = xas_load(&xas); folio && xas.xa_index <= end; + folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* @@ -2226,33 +2241,45 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) - break; + goto update_start; if (!folio_try_get_rcu(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) - goto put_page; + goto put_folio; -again: - pages[ret] = folio_file_page(folio, xas.xa_index); - if (++ret == nr_pages) - break; - if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; + if (!folio_batch_add(fbatch, folio)) { + nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; } continue; -put_page: +put_folio: folio_put(folio); + retry: xas_reset(&xas); } + +update_start: + nr = folio_batch_count(fbatch); + + if (nr) { + folio = fbatch->folios[nr - 1]; + if (folio_test_hugetlb(folio)) + *start = folio->index + 1; + else + *start = folio->index + folio_nr_pages(folio); + } +out: rcu_read_unlock(); - return ret; + return folio_batch_count(fbatch); } -EXPORT_SYMBOL(find_get_pages_contig); +EXPORT_SYMBOL(filemap_get_folios_contig); /** * find_get_pages_range_tag - Find and return head pages matching @tag. @@ -2382,6 +2409,8 @@ retry: static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { + bool workingset = folio_test_workingset(folio); + unsigned long pflags; int error; /* @@ -2390,8 +2419,13 @@ static int filemap_read_folio(struct file *file, filler_t filler, * fails. */ folio_clear_error(folio); + /* Start the actual read. The read will unlock the page. */ + if (unlikely(workingset)) + psi_memstall_enter(&pflags); error = filler(file, folio); + if (unlikely(workingset)) + psi_memstall_leave(&pflags); if (error) return error; @@ -3712,7 +3746,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ - void *fsdata; + void *fsdata = NULL; offset = (pos & (PAGE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_SIZE - offset, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 458618c7302c..69ed25790c68 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -39,12 +39,6 @@ void wait_for_stable_page(struct page *page) } EXPORT_SYMBOL_GPL(wait_for_stable_page); -bool page_mapped(struct page *page) -{ - return folio_mapped(page_folio(page)); -} -EXPORT_SYMBOL(page_mapped); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); @@ -82,11 +76,11 @@ bool redirty_page_for_writepage(struct writeback_control *wbc, } EXPORT_SYMBOL(redirty_page_for_writepage); -void lru_cache_add(struct page *page) +void lru_cache_add_inactive_or_unevictable(struct page *page, + struct vm_area_struct *vma) { - folio_add_lru(page_folio(page)); + folio_add_lru_vma(page_folio(page), vma); } -EXPORT_SYMBOL(lru_cache_add); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) @@ -102,7 +96,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); - if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + if (!folio || xa_is_value(folio)) return &folio->page; return folio_file_page(folio, index); } @@ -118,17 +112,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -void delete_from_page_cache(struct page *page) -{ - return filemap_remove_folio(page_folio(page)); -} - -int try_to_release_page(struct page *page, gfp_t gfp) -{ - return filemap_release_folio(page_folio(page), gfp); -} -EXPORT_SYMBOL(try_to_release_page); - int isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) diff --git a/mm/frontswap.c b/mm/frontswap.c index 1a97610308cb..279e55b4ed87 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. */ frontswap_map_set(sis, map); + + if (!frontswap_enabled()) + return; frontswap_ops->init(type); } @@ -123,6 +123,9 @@ retry: */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return NULL; + if (flags & FOLL_GET) return try_get_folio(page, refs); else if (flags & FOLL_PIN) { @@ -158,6 +161,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in page_try_share_anon_rmap(). + */ + smp_mb__after_atomic(); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); return folio; @@ -195,17 +205,22 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) * time. Cases: please see the try_grab_folio() documentation, with * "refs=1". * - * Return: true for success, or if no action was required (if neither FOLL_PIN - * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or - * FOLL_PIN was set, but the page could not be grabbed. + * Return: 0 for success, or if no action was required (if neither FOLL_PIN + * nor FOLL_GET was set, nothing is done). A negative error code for failure: + * + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not + * be grabbed. */ -bool __must_check try_grab_page(struct page *page, unsigned int flags) +int __must_check try_grab_page(struct page *page, unsigned int flags) { struct folio *folio = page_folio(page); WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) - return false; + return -ENOMEM; + + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio); @@ -225,7 +240,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); } - return true; + return 0; } /** @@ -530,31 +545,14 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); -retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto retry; - } - if ((flags & FOLL_NUMA) && pte_protnone(pte)) + if (!pte_present(pte)) + goto no_page; + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; page = vm_normal_page(vma, address, pte); @@ -596,7 +594,7 @@ retry: } } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } @@ -605,10 +603,12 @@ retry: !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ - if (unlikely(!try_grab_page(page, flags))) { - page = ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (unlikely(ret)) { + page = ERR_PTR(ret); goto out; } + /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see @@ -661,42 +661,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd(mm, address, pmd, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pmd_val(pmdval)))) { - page = follow_huge_pd(vma, address, - __hugepd(pmd_val(pmdval)), flags, - PMD_SHIFT); - if (page) - return page; + if (!pmd_present(pmdval)) return no_page_table(vma, flags); - } -retry: - if (!pmd_present(pmdval)) { - /* - * Should never reach here, if thp migration is not supported; - * Otherwise, it must be a thp migration entry. - */ - VM_BUG_ON(!thp_migration_supported() || - !is_pmd_migration_entry(pmdval)); - - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - - pmd_migration_entry_wait(mm, pmd); - pmdval = READ_ONCE(*pmd); - /* - * MADV_DONTNEED may convert the pmd to null because - * mmap_lock is held in read mode - */ - if (pmd_none(pmdval)) - return no_page_table(vma, flags); - goto retry; - } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -707,21 +673,13 @@ retry: if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); - if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) + if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); -retry_locked: ptl = pmd_lock(mm, pmd); - if (unlikely(pmd_none(*pmd))) { - spin_unlock(ptl); - return no_page_table(vma, flags); - } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - pmd_migration_entry_wait(mm, pmd); - goto retry_locked; + return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); @@ -764,20 +722,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pud_val(*pud)))) { - page = follow_huge_pd(vma, address, - __hugepd(pud_val(*pud)), flags, - PUD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); @@ -797,7 +741,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { p4d_t *p4d; - struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -806,14 +749,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); - if (is_hugepd(__hugepd(p4d_val(*p4d)))) { - page = follow_huge_pd(vma, address, - __hugepd(p4d_val(*p4d)), flags, - P4D_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } return follow_pud_mask(vma, address, p4d, flags, ctx); } @@ -851,10 +786,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, ctx->page_mask = 0; - /* make this handle hugepd */ - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); + /* + * Call hugetlb_follow_page_mask for hugetlb vmas as it will use + * special hugetlb page table walking code. This eliminates the + * need to check for hugetlb entries in the general walking code. + * + * hugetlb_follow_page_mask is only for follow_page() handling here. + * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. + */ + if (is_vm_hugetlb_page(vma)) { + page = hugetlb_follow_page_mask(vma, address, flags); + if (!page) + page = no_page_table(vma, flags); return page; } @@ -863,21 +806,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - if (pgd_huge(*pgd)) { - page = follow_huge_pgd(mm, address, pgd, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pgd_val(*pgd)))) { - page = follow_huge_pd(vma, address, - __hugepd(pgd_val(*pgd)), flags, - PGDIR_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } - return follow_p4d_mask(vma, address, pgd, flags, ctx); } @@ -941,10 +869,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(*pte); } - if (unlikely(!try_grab_page(*page, gup_flags))) { - ret = -ENOMEM; + ret = try_grab_page(*page, gup_flags); + if (unlikely(ret)) goto unmap; - } out: ret = 0; unmap: @@ -970,8 +897,17 @@ static int faultin_page(struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + /* + * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set + * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. + * That's because some callers may not be prepared to + * handle early exits caused by non-fatal signals. + */ + if (*flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { @@ -1039,6 +975,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA)) + return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) return -EFAULT; @@ -1046,6 +985,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; + /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ + if (is_vm_hugetlb_page(vma)) + return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -1153,14 +1095,6 @@ static long __get_user_pages(struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); - /* - * If FOLL_FORCE is set then do not force a full fault as the hinting - * fault information is unrelated to the reference behaviour of a task - * using the address space - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - do { struct page *page; unsigned int foll_flags = gup_flags; @@ -1381,6 +1315,22 @@ retry: EXPORT_SYMBOL_GPL(fixup_user_fault); /* + * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is + * specified, it'll also respond to generic signals. The caller of GUP + * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. + */ +static bool gup_signal_pending(unsigned int flags) +{ + if (fatal_signal_pending(current)) + return true; + + if (!(flags & FOLL_INTERRUPTIBLE)) + return false; + + return signal_pending(current); +} + +/* * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT */ @@ -1461,11 +1411,11 @@ retry: * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted - * by fatal signals, so we need to check it before we + * by fatal signals of even common signals, depending on + * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ - - if (fatal_signal_pending(current)) { + if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; @@ -1667,10 +1617,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) if (!locked) { locked = 1; mmap_read_lock(mm); - vma = find_vma(mm, nstart); + vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) + vma = find_vma_intersection(mm, vma->vm_end, end); + + if (!vma) break; /* * Set [nstart; nend) to intersection of desired address @@ -1927,20 +1878,16 @@ struct page *get_dump_page(unsigned long addr) #ifdef CONFIG_MIGRATION /* - * Check whether all pages are pinnable, if so return number of pages. If some - * pages are not pinnable, migrate them, and unpin all pages. Return zero if - * pages were migrated, or if some pages were not successfully isolated. - * Return negative error if migration fails. + * Returns the number of collected pages. Return value is always >= 0. */ -static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) +static unsigned long collect_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) { - unsigned long isolation_error_count = 0, i; + unsigned long i, collected = 0; struct folio *prev_folio = NULL; - LIST_HEAD(movable_page_list); - bool drain_allow = true, coherent_pages = false; - int ret = 0; + bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); @@ -1949,45 +1896,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - /* - * Device coherent pages are managed by a driver and should not - * be pinned indefinitely as it prevents the driver moving the - * page. So when trying to pin with FOLL_LONGTERM instead try - * to migrate the page out of device memory. - */ - if (folio_is_device_coherent(folio)) { - /* - * We always want a new GUP lookup with device coherent - * pages. - */ - pages[i] = 0; - coherent_pages = true; - - /* - * Migration will fail if the page is pinned, so convert - * the pin on the source page to a normal reference. - */ - if (gup_flags & FOLL_PIN) { - get_page(&folio->page); - unpin_user_page(&folio->page); - } + if (folio_is_longterm_pinnable(folio)) + continue; - ret = migrate_device_coherent_page(&folio->page); - if (ret) - goto unpin_pages; + collected++; + if (folio_is_device_coherent(folio)) continue; - } - if (folio_is_longterm_pinnable(folio)) - continue; - /* - * Try to move out any movable page before pinning the range. - */ if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(&folio->page, - &movable_page_list)) - isolation_error_count++; + isolate_hugetlb(&folio->page, movable_page_list); continue; } @@ -1996,63 +1914,124 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, drain_allow = false; } - if (folio_isolate_lru(folio)) { - isolation_error_count++; + if (!folio_isolate_lru(folio)) continue; - } - list_add_tail(&folio->lru, &movable_page_list); + + list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } - if (!list_empty(&movable_page_list) || isolation_error_count || - coherent_pages) - goto unpin_pages; + return collected; +} - /* - * If list is empty, and no isolation errors, means that all pages are - * in the correct zone. - */ - return nr_pages; +/* + * Unpins all pages and migrates device coherent pages and movable_page_list. + * Returns -EAGAIN if all pages were successfully migrated or -errno for failure + * (or partial success). + */ +static int migrate_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) +{ + int ret; + unsigned long i; -unpin_pages: - /* - * pages[i] might be NULL if any device coherent pages were found. - */ for (i = 0; i < nr_pages; i++) { - if (!pages[i]) + struct folio *folio = page_folio(pages[i]); + + if (folio_is_device_coherent(folio)) { + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + pages[i] = NULL; + folio_get(folio); + gup_put_folio(folio, 1, FOLL_PIN); + + if (migrate_device_coherent_page(&folio->page)) { + ret = -EBUSY; + goto err; + } + continue; + } - if (gup_flags & FOLL_PIN) - unpin_user_page(pages[i]); - else - put_page(pages[i]); + /* + * We can't migrate pages with unexpected references, so drop + * the reference obtained by __get_user_pages_locked(). + * Migrating pages have been added to movable_page_list after + * calling folio_isolate_lru() which takes a reference so the + * page won't be freed if it's migrating. + */ + unpin_user_page(pages[i]); + pages[i] = NULL; } - if (!list_empty(&movable_page_list)) { + if (!list_empty(movable_page_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; - ret = migrate_pages(&movable_page_list, alloc_migration_target, - NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN, NULL); - if (ret > 0) /* number of pages not migrated */ + if (migrate_pages(movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; + goto err; + } } - if (ret && !list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); + putback_movable_pages(movable_page_list); + + return -EAGAIN; + +err: + for (i = 0; i < nr_pages; i++) + if (pages[i]) + unpin_user_page(pages[i]); + putback_movable_pages(movable_page_list); + return ret; } + +/* + * Check whether all pages are *allowed* to be pinned. Rather confusingly, all + * pages in the range are required to be pinned via FOLL_PIN, before calling + * this routine. + * + * If any pages in the range are not allowed to be pinned, then this routine + * will migrate those pages away, unpin all the pages in the range and return + * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then + * call this routine again. + * + * If an error other than -EAGAIN occurs, this indicates a migration failure. + * The caller should give up, and propagate the error back up the call stack. + * + * If everything is OK and all pages in the range are allowed to be pinned, then + * this routine leaves all pages pinned and returns zero for success. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + unsigned long collected; + LIST_HEAD(movable_page_list); + + collected = collect_longterm_unpinnable_pages(&movable_page_list, + nr_pages, pages); + if (!collected) + return 0; + + return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, + pages); +} #else static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) + struct page **pages) { - return nr_pages; + return 0; } #endif /* CONFIG_MIGRATION */ @@ -2065,25 +2044,53 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, + int *locked, unsigned int gup_flags) { + bool must_unlock = false; unsigned int flags; - long rc; + long rc, nr_pinned_pages; + + if (locked && WARN_ON_ONCE(!*locked)) + return -EINVAL; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); + locked, gup_flags); + + /* + * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM + * implies FOLL_PIN (although the reverse is not true). Therefore it is + * correct to unconditionally call check_and_migrate_movable_pages() + * which assumes pages have been pinned via FOLL_PIN. + * + * Enforce the above reasoning by asserting that FOLL_PIN is set. + */ + if (WARN_ON(!(gup_flags & FOLL_PIN))) + return -EINVAL; flags = memalloc_pin_save(); do { - rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); - if (rc <= 0) + if (locked && !*locked) { + mmap_read_lock(mm); + must_unlock = true; + *locked = 1; + } + nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, + pages, vmas, locked, + gup_flags); + if (nr_pinned_pages <= 0) { + rc = nr_pinned_pages; break; - rc = check_and_migrate_movable_pages(rc, pages, gup_flags); - } while (!rc); + } + rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); + } while (rc == -EAGAIN); memalloc_pin_restore(flags); - return rc; + if (locked && *locked && must_unlock) { + mmap_read_unlock(mm); + *locked = 0; + } + return rc ? rc : nr_pinned_pages; } static bool is_valid_gup_flags(unsigned int gup_flags) @@ -2106,35 +2113,6 @@ static bool is_valid_gup_flags(unsigned int gup_flags) } #ifdef CONFIG_MMU -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - /* - * Parts of FOLL_LONGTERM behavior are incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. However, this only comes up if locked is set, and there are - * callers that do request FOLL_LONGTERM, but do not set locked. So, - * allow what we can. - */ - if (gup_flags & FOLL_LONGTERM) { - if (WARN_ON_ONCE(locked)) - return -EINVAL; - /* - * This will check the vmas (even if our vmas arg is NULL) - * and return -ENOTSUPP if DAX isn't allowed in this case: - */ - return __gup_longterm_locked(mm, start, nr_pages, pages, - vmas, gup_flags | FOLL_TOUCH | - FOLL_REMOTE); - } - - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); -} - /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm @@ -2203,8 +2181,8 @@ long get_user_pages_remote(struct mm_struct *mm, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2216,14 +2194,6 @@ long get_user_pages_remote(struct mm_struct *mm, { return 0; } - -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - return 0; -} #endif /* !CONFIG_MMU */ /** @@ -2250,7 +2220,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, return -EINVAL; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags | FOLL_TOUCH); + pages, vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -2276,18 +2246,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - mmap_read_lock(mm); - ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked, + gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); return ret; @@ -2345,8 +2306,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL -static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +/* + * Fast-gup relies on pte change detection to avoid concurrent pgtable + * operations. + * + * To pin the page, fast-gup needs to do below in order: + * (1) pin the page (by prefetching pte), then (2) check pte not changed. + * + * For the rest of pgtable operations where pgtable updates can be racy + * with fast-gup, we need to do (1) clear pte, then (2) check whether page + * is pinned. + * + * Above will work for all pte-level operations, including THP split. + * + * For THP collapse, it's a bit more complicated because fast-gup may be + * walking a pgtable page that is being freed (pte is still valid but pmd + * can be cleared already). To avoid race in such condition, we need to + * also check pmd here to make sure pmd doesn't change (corresponds to + * pmdp_collapse_flush() in the THP collapse code path). + */ +static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -2358,11 +2339,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, struct page *page; struct folio *folio; - /* - * Similar to the PMD case below, NUMA hinting must take slow - * path using the pte_protnone check. - */ - if (pte_protnone(pte)) + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) @@ -2392,12 +2369,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, goto pte_unmap; } - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || + unlikely(pte_val(pte) != pte_val(*ptep))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2439,8 +2417,9 @@ pte_unmap: * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ -static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { return 0; } @@ -2462,9 +2441,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, undo_dev_pagemap(nr, nr_start, flags, pages); break; } + + if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { + undo_dev_pagemap(nr, nr_start, flags, pages); + break; + } + SetPageReferenced(page); pages[*nr] = page; - if (unlikely(!try_grab_page(page, flags))) { + if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } @@ -2582,7 +2567,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2648,7 +2633,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2688,7 +2673,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2736,7 +2721,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo pmdp = pmd_offset_lockless(pudp, pud, addr); do { - pmd_t pmd = READ_ONCE(*pmdp); + pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) @@ -2744,12 +2729,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) + if (pmd_protnone(pmd) && + !gup_can_follow_protnone(flags)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, @@ -2764,7 +2745,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) + } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -2784,7 +2765,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; - if (unlikely(pud_huge(pud))) { + if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; @@ -2867,29 +2848,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -static int __gup_longterm_unlocked(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - int ret; - - /* - * FIXME: FOLL_LONGTERM does not work with - * get_user_pages_unlocked() (see comments in that function) - */ - if (gup_flags & FOLL_LONGTERM) { - mmap_read_lock(current->mm); - ret = __gup_longterm_locked(current->mm, - start, nr_pages, - pages, NULL, gup_flags); - mmap_read_unlock(current->mm); - } else { - ret = get_user_pages_unlocked(start, nr_pages, - pages, gup_flags); - } - - return ret; -} - static unsigned long lockless_pages_from_mm(unsigned long start, unsigned long end, unsigned int gup_flags, @@ -2950,7 +2908,8 @@ static int internal_get_user_pages_fast(unsigned long start, if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | - FOLL_FAST_ONLY | FOLL_NOFAULT))) + FOLL_FAST_ONLY | FOLL_NOFAULT | + FOLL_PCI_P2PDMA))) return -EINVAL; if (gup_flags & FOLL_PIN) @@ -2973,8 +2932,8 @@ static int internal_get_user_pages_fast(unsigned long start, /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, - pages); + ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages, + gup_flags); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3173,9 +3132,9 @@ long pin_user_pages_remote(struct mm_struct *mm, if (WARN_ON_ONCE(!pages)) return -EINVAL; - gup_flags |= FOLL_PIN; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_PIN | FOLL_TOUCH | + FOLL_REMOTE); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3209,7 +3168,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, gup_flags |= FOLL_PIN; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags); + pages, vmas, NULL, gup_flags); } EXPORT_SYMBOL(pin_user_pages); diff --git a/mm/gup_test.c b/mm/gup_test.c index 12b0a91767d3..8ae7307a1bb6 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -4,6 +4,7 @@ #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/debugfs.h> +#include <linux/highmem.h> #include "gup_test.h" static void put_back_pages(unsigned int cmd, struct page **pages, @@ -203,6 +204,138 @@ free_pages: return ret; } +static DEFINE_MUTEX(pin_longterm_test_mutex); +static struct page **pin_longterm_test_pages; +static unsigned long pin_longterm_test_nr_pages; + +static inline void pin_longterm_test_stop(void) +{ + if (pin_longterm_test_pages) { + if (pin_longterm_test_nr_pages) + unpin_user_pages(pin_longterm_test_pages, + pin_longterm_test_nr_pages); + kvfree(pin_longterm_test_pages); + pin_longterm_test_pages = NULL; + pin_longterm_test_nr_pages = 0; + } +} + +static inline int pin_longterm_test_start(unsigned long arg) +{ + long nr_pages, cur_pages, addr, remaining_pages; + int gup_flags = FOLL_LONGTERM; + struct pin_longterm_test args; + struct page **pages; + int ret = 0; + bool fast; + + if (pin_longterm_test_pages) + return -EINVAL; + + if (copy_from_user(&args, (void __user *)arg, sizeof(args))) + return -EFAULT; + + if (args.flags & + ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST)) + return -EINVAL; + if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE)) + return -EINVAL; + if (args.size > LONG_MAX) + return -EINVAL; + nr_pages = args.size / PAGE_SIZE; + if (!nr_pages) + return -EINVAL; + + pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE) + gup_flags |= FOLL_WRITE; + fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST); + + if (!fast && mmap_read_lock_killable(current->mm)) { + kvfree(pages); + return -EINTR; + } + + pin_longterm_test_pages = pages; + pin_longterm_test_nr_pages = 0; + + while (nr_pages - pin_longterm_test_nr_pages) { + remaining_pages = nr_pages - pin_longterm_test_nr_pages; + addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE; + + if (fast) + cur_pages = pin_user_pages_fast(addr, remaining_pages, + gup_flags, pages); + else + cur_pages = pin_user_pages(addr, remaining_pages, + gup_flags, pages, NULL); + if (cur_pages < 0) { + pin_longterm_test_stop(); + ret = cur_pages; + break; + } + pin_longterm_test_nr_pages += cur_pages; + pages += cur_pages; + } + + if (!fast) + mmap_read_unlock(current->mm); + return ret; +} + +static inline int pin_longterm_test_read(unsigned long arg) +{ + __u64 user_addr; + unsigned long i; + + if (!pin_longterm_test_pages) + return -EINVAL; + + if (copy_from_user(&user_addr, (void __user *)arg, sizeof(user_addr))) + return -EFAULT; + + for (i = 0; i < pin_longterm_test_nr_pages; i++) { + void *addr = kmap_local_page(pin_longterm_test_pages[i]); + unsigned long ret; + + ret = copy_to_user((void __user *)(unsigned long)user_addr, addr, + PAGE_SIZE); + kunmap_local(addr); + if (ret) + return -EFAULT; + user_addr += PAGE_SIZE; + } + return 0; +} + +static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + int ret = -EINVAL; + + if (mutex_lock_killable(&pin_longterm_test_mutex)) + return -EINTR; + + switch (cmd) { + case PIN_LONGTERM_TEST_START: + ret = pin_longterm_test_start(arg); + break; + case PIN_LONGTERM_TEST_STOP: + pin_longterm_test_stop(); + ret = 0; + break; + case PIN_LONGTERM_TEST_READ: + ret = pin_longterm_test_read(arg); + break; + } + + mutex_unlock(&pin_longterm_test_mutex); + return ret; +} + static long gup_test_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -217,6 +350,10 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd, case PIN_BASIC_TEST: case DUMP_USER_PAGES_TEST: break; + case PIN_LONGTERM_TEST_START: + case PIN_LONGTERM_TEST_STOP: + case PIN_LONGTERM_TEST_READ: + return pin_longterm_test_ioctl(filep, cmd, arg); default: return -EINVAL; } @@ -234,9 +371,17 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd, return 0; } +static int gup_test_release(struct inode *inode, struct file *file) +{ + pin_longterm_test_stop(); + + return 0; +} + static const struct file_operations gup_test_fops = { .open = nonseekable_open, .unlocked_ioctl = gup_test_ioctl, + .release = gup_test_release, }; static int __init gup_test_init(void) diff --git a/mm/gup_test.h b/mm/gup_test.h index 887ac1d5f5bc..5b37b54e8bea 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -10,6 +10,9 @@ #define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test) #define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test) #define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test) +#define PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test) +#define PIN_LONGTERM_TEST_STOP _IO('g', 8) +#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64) #define GUP_TEST_MAX_PAGES_TO_DUMP 8 @@ -30,4 +33,13 @@ struct gup_test { __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP]; }; +#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1 +#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2 + +struct pin_longterm_test { + __u64 addr; + __u64 size; + __u32 flags; +}; + #endif /* __GUP_TEST_H */ diff --git a/mm/highmem.c b/mm/highmem.c index c707d7202d5f..db251e77f98f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -30,6 +30,17 @@ #include <asm/tlbflush.h> #include <linux/vmalloc.h> +#ifdef CONFIG_KMAP_LOCAL +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif +#endif /* CONFIG_KMAP_LOCAL */ + /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -142,12 +153,29 @@ pte_t *pkmap_page_table; struct page *__kmap_to_page(void *vaddr) { + unsigned long base = (unsigned long) vaddr & PAGE_MASK; + struct kmap_ctrl *kctrl = ¤t->kmap_ctrl; unsigned long addr = (unsigned long)vaddr; + int i; + + /* kmap() mappings */ + if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && + addr < PKMAP_ADDR(LAST_PKMAP))) + return pte_page(pkmap_page_table[PKMAP_NR(addr)]); - if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { - int i = PKMAP_NR(addr); + /* kmap_local_page() mappings */ + if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && + base < __fix_to_virt(FIX_KMAP_BEGIN))) { + for (i = 0; i < kctrl->idx; i++) { + unsigned long base_addr; + int idx; - return pte_page(pkmap_page_table[i]); + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + + if (base_addr == base) + return pte_page(kctrl->pteval[i]); + } } return virt_to_page(vaddr); @@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void) # define arch_kmap_local_post_unmap(vaddr) do { } while (0) #endif -#ifndef arch_kmap_local_map_idx -#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) -#endif - #ifndef arch_kmap_local_unmap_idx #define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) #endif @@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) return false; } -static inline int kmap_local_calc_idx(int idx) -{ - return idx + KM_MAX_IDX * smp_processor_id(); -} - static pte_t *__kmap_pte; static pte_t *kmap_get_pte(unsigned long vaddr, int idx) @@ -253,7 +253,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset(entry) | cpu_flags; + *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; return 0; } @@ -361,8 +361,7 @@ again: * huge or device mapping one and compute corresponding pfn * values. */ - pmd = pmd_read_atomic(pmdp); - barrier(); + pmd = pmdp_get_lockless(pmdp); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e9414ee57c5b..abe6cfd92ffa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -36,6 +36,7 @@ #include <linux/numa.h> #include <linux/page_owner.h> #include <linux/sched/sysctl.h> +#include <linux/memory-tiers.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -70,9 +71,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; @@ -119,13 +119,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); + return shmem_huge_enabled(vma, !enforce_sysfs); - if (!hugepage_flags_enabled()) - return false; - - /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) return false; /* Only regular file is valid */ @@ -164,7 +163,6 @@ retry: count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } - count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); @@ -176,6 +174,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); + count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } @@ -772,8 +771,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - if (pgtable) - pgtable_trans_huge_deposit(mm, pmd, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } @@ -1037,6 +1035,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; struct page *page; + int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -1068,8 +1067,9 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - if (!try_grab_page(page, flags)) - page = ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (ret) + page = ERR_PTR(ret); return page; } @@ -1195,6 +1195,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; struct page *page; + int ret; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1228,8 +1229,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - if (!try_grab_page(page, flags)) - page = ERR_PTR(-ENOMEM); + + ret = try_grab_page(page, flags); + if (ret) + page = ERR_PTR(ret); return page; } @@ -1307,6 +1310,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; @@ -1314,9 +1318,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); - if (is_huge_zero_pmd(orig_pmd)) goto fallback; @@ -1328,46 +1329,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } page = pmd_page(orig_pmd); + folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; - if (!trylock_page(page)) { - get_page(page); + if (!folio_trylock(folio)) { + folio_get(folio); spin_unlock(vmf->ptl); - lock_page(page); + folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } - put_page(page); + folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { - unlock_page(page); + folio_unlock(folio); goto reuse; } /* - * See do_wp_page(): we can only reuse the page exclusively if there are - * no additional references. Note that we always drain the LRU - * pagevecs immediately after adding a THP. + * See do_wp_page(): we can only reuse the folio exclusively if + * there are no additional references. Note that we always drain + * the LRU pagevecs immediately after adding a THP. */ - if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + if (folio_ref_count(folio) > + 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (page_count(page) == 1) { + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_ref_count(folio) == 1) { pmd_t entry; page_move_anon_rmap(page, vma); - unlock_page(page); + folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); @@ -1378,17 +1381,47 @@ reuse: if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - return VM_FAULT_WRITE; + return 0; } unlock_fallback: - unlock_page(page); + folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } +static inline bool can_change_pmd_writable(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page; + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; + + /* Don't touch entries that are not even readable (NUMA hinting). */ + if (pmd_protnone(pmd)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_huge_pmd_wp(vma, pmd)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* See can_change_pte_writable(). */ + page = vm_normal_page_pmd(vma, addr, pmd); + return page && PageAnon(page) && PageAnonExclusive(page); + } + + /* See can_change_pte_writable(). */ + return pmd_dirty(pmd); +} + /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, @@ -1434,6 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct page *page; + int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -1449,17 +1483,18 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; - if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - if (!try_grab_page(page, flags)) - return ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (ret) + return ERR_PTR(ret); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); @@ -1479,9 +1514,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; - int target_nid, last_cpupid = -1; - bool migrated = false; - bool was_writable = pmd_savedwrite(oldpmd); + int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); + bool migrated = false, writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1491,16 +1525,31 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } pmd = pmd_modify(oldpmd, vma->vm_page_prot); + + /* + * Detect now whether the PMD could be writable; this information + * is only valid while holding the PT lock. + */ + writable = pmd_write(pmd); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pmd_writable(vma, vmf->address, pmd)) + writable = true; + page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) goto out_map; /* See similar comment in do_numa_page for explanation */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); - last_cpupid = page_cpupid_last(page); + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (node_is_toptier(page_nid)) + last_cpupid = page_cpupid_last(page); target_nid = numa_migrate_prep(page, vma, haddr, page_nid, &flags); @@ -1510,6 +1559,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } spin_unlock(vmf->ptl); + writable = false; migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { @@ -1536,7 +1586,7 @@ out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); - if (was_writable) + if (writable) pmd = pmd_mkwrite(pmd); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); @@ -1777,11 +1827,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; - bool preserve_write; - int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); @@ -1792,9 +1841,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; - #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -1824,6 +1870,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (prot_numa) { struct page *page; + bool toptier; /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1836,13 +1883,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, goto unlock; page = pmd_page(*pmd); + toptier = node_is_toptier(page_to_nid(page)); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(page_to_nid(page))) + toptier) goto unlock; + + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical @@ -1868,8 +1920,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); if (uffd_wp) { entry = pmd_wrprotect(entry); entry = pmd_mkuffd_wp(entry); @@ -1881,13 +1931,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ entry = pmd_clear_uffd_wp(entry); } + + /* See change_pte_range(). */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && + can_change_pmd_writable(vma, addr, entry)) + entry = pmd_mkwrite(entry); + ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); if (huge_pmd_needs_flush(oldpmd, entry)) tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); - - BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: spin_unlock(ptl); return ret; @@ -2029,7 +2083,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pgtable_t pgtable; pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; - bool anon_exclusive = false; + bool anon_exclusive = false, dirty = false; unsigned long addr; int i; @@ -2113,20 +2167,22 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = is_writable_migration_entry(entry); if (PageAnon(page)) anon_exclusive = is_readable_exclusive_migration_entry(entry); - young = false; + young = is_migration_entry_young(entry); + dirty = is_migration_entry_dirty(entry); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); - if (pmd_dirty(old_pmd)) + if (pmd_dirty(old_pmd)) { + dirty = true; SetPageDirty(page); + } write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); uffd_wp = pmd_uffd_wp(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); - page_ref_add(page, HPAGE_PMD_NR - 1); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2140,10 +2196,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. + * + * See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) freeze = false; + if (!freeze) + page_ref_add(page, HPAGE_PMD_NR - 1); } /* @@ -2171,6 +2231,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, else swp_entry = make_readable_migration_entry( page_to_pfn(page + i)); + if (young) + swp_entry = make_migration_entry_young(swp_entry); + if (dirty) + swp_entry = make_migration_entry_dirty(swp_entry); entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); @@ -2181,60 +2245,37 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = maybe_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); - if (!write) - entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); + /* + * NOTE: this needs to happen after pte_mkdirty, + * because some archs (sparc64, loongarch) could + * set hw write bit when mkdirty. + */ + if (!write) + entry = pte_wrprotect(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); + page_add_anon_rmap(page + i, vma, addr, false); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - if (!pmd_migration) - atomic_inc(&page[i]._mapcount); pte_unmap(pte); } - if (!pmd_migration) { - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). - */ - if (compound_mapcount(page) > 1 && - !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_inc(&page[i]._mapcount); - } - - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. */ - __mod_lruvec_page_state(page, NR_ANON_THPS, - -HPAGE_PMD_NR); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); - } - } - unlock_page_memcg(page); - - /* Above is effectively page_remove_rmap(page, vma, true) */ - munlock_vma_page(page, vma, true); - } + if (!pmd_migration) + page_remove_rmap(page, vma, true); + if (freeze) + put_page(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - - if (freeze) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, vma, false); - put_page(page + i); - } - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, @@ -2288,25 +2329,11 @@ out: void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) + if (!pmd) return; - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, folio); } @@ -2334,24 +2361,23 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } } -static void unmap_page(struct page *page) +static void unmap_folio(struct folio *folio) { - struct folio *folio = page_folio(page); enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* * Anon pages need migration entries to preserve them, but file @@ -2368,7 +2394,7 @@ static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; - /* If unmap_page() uses try_to_migrate() on file, remove this check */ + /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { @@ -2418,7 +2444,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, - * PG_anon_exclusive has been cleared in unmap_page() and is stored in + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop @@ -2435,17 +2461,32 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X (1L << PG_arch_2) | + (1L << PG_arch_3) | #endif - (1L << PG_dirty))); + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); - /* ->mapping in first tail page is compound_mapcount */ + /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; - page_tail->private = 0; + + /* + * page->private should not be set in tail pages with the exception + * of swap cache pages that store the swp_entry_t in tail pages. + * Fix up and warn once if private is unexpectedly set. + * + * What of 32-bit systems, on which head[1].compound_pincount overlays + * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and + * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + */ + if (!folio_test_swapcache(page_folio(head))) { + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + page_tail->private = 0; + } /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); @@ -2611,27 +2652,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); - XA_STATE(xas, &head->mapping->i_pages, head->index); + struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_PAGE(!PageLocked(head), head); - VM_BUG_ON_PAGE(!PageCompound(head), head); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - is_hzp = is_huge_zero_page(head); - VM_WARN_ON_ONCE_PAGE(is_hzp, head); + is_hzp = is_huge_zero_page(&folio->page); + VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); if (is_hzp) return -EBUSY; - if (PageWriteback(head)) + if (folio_test_writeback(folio)) return -EBUSY; - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -2640,7 +2680,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(head); + anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; @@ -2649,7 +2689,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) mapping = NULL; anon_vma_lock_write(anon_vma); } else { - mapping = head->mapping; + gfp_t gfp; + + mapping = folio->mapping; /* Truncated ? */ if (!mapping) { @@ -2657,8 +2699,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), - mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + + if (folio_test_private(folio) && + !filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } + + xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -2672,7 +2722,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but - * head page lock is good enough to serialize the trimming. + * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) @@ -2680,46 +2730,46 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* - * Racy check if we can split the page, before unmap_page() will + * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { - ret = -EBUSY; + ret = -EAGAIN; goto out_unlock; } - unmap_page(head); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* - * Check if the head page is present in page cache. - * We assume all tail are present too, if head is there. + * Check if the folio is present in page cache. + * We assume all tail are present too, if folio is there. */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != head) + if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - if (page_ref_freeze(head, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(head))) { + if (folio_ref_freeze(folio, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(&folio->page))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(head)); + list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { - int nr = thp_nr_pages(head); + int nr = folio_nr_pages(folio); - xas_split(&xas, head, thp_order(head)); - if (PageSwapBacked(head)) { - __mod_lruvec_page_state(head, NR_SHMEM_THPS, + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __mod_lruvec_page_state(head, NR_FILE_THPS, + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } @@ -2734,7 +2784,7 @@ fail: xas_unlock(&xas); local_irq_enable(); remap_page(folio, folio_nr_pages(folio)); - ret = -EBUSY; + ret = -EAGAIN; } out_unlock: @@ -2894,11 +2944,9 @@ static void split_huge_pages_all(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { int nr_pages; - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - if (!get_page_unless_zero(page)) + page = pfn_to_online_page(pfn); + if (!page || !get_page_unless_zero(page)) continue; if (zone != page_zone(page)) @@ -2985,7 +3033,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, /* FOLL_DUMP to ignore special (like zero) pages */ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) continue; if (!is_transparent_hugepage(page)) @@ -3040,28 +3088,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, mapping = candidate->f_mapping; for (index = off_start; index < off_end; index += nr_pages) { - struct page *fpage = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, + FGP_ENTRY, 0); nr_pages = 1; - if (xa_is_value(fpage) || !fpage) + if (xa_is_value(folio) || !folio) continue; - if (!is_transparent_hugepage(fpage)) + if (!folio_test_large(folio)) goto next; total++; - nr_pages = thp_nr_pages(fpage); + nr_pages = folio_nr_pages(folio); - if (!trylock_page(fpage)) + if (!folio_trylock(folio)) goto next; - if (!split_huge_page(fpage)) + if (!split_folio(folio)) split++; - unlock_page(fpage); + folio_unlock(folio); next: - put_page(fpage); + folio_put(folio); cond_resched(); } @@ -3177,6 +3225,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + /* See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); @@ -3191,6 +3240,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); else entry = make_readable_migration_entry(page_to_pfn(page)); + if (pmd_young(pmdval)) + entry = make_migration_entry_young(entry); + if (pmd_dirty(pmdval)) + entry = make_migration_entry_dirty(entry); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); @@ -3216,13 +3269,18 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); get_page(new); - pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot))); + pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) pmde = maybe_pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); + if (!is_migration_entry_young(entry)) + pmde = pmd_mkold(pmde); + /* NOTE: this may contain setting soft-dirty on some archs */ + if (PageDirty(new) && is_migration_entry_dirty(entry)) + pmde = pmd_mkdirty(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e070b8593b37..db895230ee7e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -33,6 +33,7 @@ #include <linux/migrate.h> #include <linux/nospec.h> #include <linux/delayacct.h> +#include <linux/memory.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -53,13 +54,13 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { - return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, 1 << order); } #else -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { return false; } @@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -251,13 +255,159 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) return subpool_inode(file_inode(vma->vm_file)); } +/* + * hugetlb vma_lock helper routines + */ +static bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (!__vma_shareable_lock(vma)) + return 1; + + return down_write_trylock(&vma_lock->rw_sema); +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +{ + struct vm_area_struct *vma = vma_lock->vma; + + /* + * vma_lock structure may or not be released as a result of put, + * it certainly will no longer be attached to vma so clear pointer. + * Semaphore synchronizes access to vma_lock->vma field. + */ + vma_lock->vma = NULL; + vma->vm_private_data = NULL; + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); +} + +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. + */ + if (!vma || !__vma_shareable_lock(vma)) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) { + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. This is only a possible + * performance enhancement and memory saving issue. + * However, the lock is also used to synchronize page + * faults with truncation. If the lock is not present, + * unlikely races could leave pages in a file past i_size + * until the file is removed. Warn in the unlikely case of + * allocation failure. + */ + pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); + return; + } + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { - struct file_region *nrg = NULL; + struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); @@ -339,7 +489,7 @@ static bool has_same_uncharge_info(struct file_region *rg, static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { - struct file_region *nrg = NULL, *prg = NULL; + struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && @@ -456,14 +606,12 @@ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { - struct list_head allocated_regions; + LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); - INIT_LIST_HEAD(&allocated_regions); - /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. @@ -860,7 +1008,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * - * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. @@ -1007,12 +1155,28 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ -void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - if (!(vma->vm_flags & VM_MAYSHARE)) - vma->vm_private_data = (void *)0; + /* + * Clear vm_private_data + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + * Before clearing, make sure pointer is not associated with vma + * as this will leak the structure. This is the case when called + * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already + * been called to allocate a new structure. + * - For MAP_PRIVATE mappings, this is the reserve map which does + * not apply to children. Faults generated by the children are + * not guaranteed to succeed, even if read-only. + */ + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock && vma_lock->vma != vma) + vma->vm_private_data = NULL; + } else + vma->vm_private_data = NULL; } /* @@ -1043,7 +1207,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); } - reset_vma_resv_huge_pages(vma); + hugetlb_dup_vma_private(vma); } /* Returns true if the VMA has associated reserve pages */ @@ -1109,17 +1273,17 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) return false; } -static void enqueue_huge_page(struct hstate *h, struct page *page) +static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - list_move(&page->lru, &h->hugepage_freelists[nid]); + list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; - SetHPageFreed(page); + folio_set_hugetlb_freed(folio); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1182,6 +1346,11 @@ retry_cpuset: return NULL; } +static unsigned long available_huge_pages(struct hstate *h) +{ + return h->free_huge_pages - h->resv_huge_pages; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -1198,12 +1367,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma, chg) && - h->free_huge_pages - h->resv_huge_pages == 0) + if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + if (avoid_reserve && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1303,75 +1471,76 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) nr_nodes--) /* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_page(struct page *page, +static void __destroy_compound_gigantic_folio(struct folio *folio, unsigned int order, bool demote) { int i; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = folio_page(folio, i); p->mapping = NULL; clear_compound_head(p); if (!demote) set_page_refcounted(p); } - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); } -static void destroy_compound_hugetlb_page_for_demote(struct page *page, +static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, true); + __destroy_compound_gigantic_folio(folio, order, true); } #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_page(struct page *page, +static void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, false); + __destroy_compound_gigantic_folio(folio, order, false); } -static void free_gigantic_page(struct page *page, unsigned int order) +static void free_gigantic_folio(struct folio *folio, unsigned int order) { /* * If the page isn't allocated using the cma allocator, * cma_release() returns false. */ #ifdef CONFIG_CMA - if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) + int nid = folio_nid(folio); + + if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) return; #endif - free_contig_range(page_to_pfn(page), 1 << order); + free_contig_range(folio_pfn(folio), 1 << order); } #ifdef CONFIG_CONTIG_ALLOC -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { + struct page *page; unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); #ifdef CONFIG_CMA { - struct page *page; int node; if (hugetlb_cma[nid]) { page = cma_alloc(hugetlb_cma[nid], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } if (!(gfp_mask & __GFP_THISNODE)) { @@ -1382,17 +1551,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, page = cma_alloc(hugetlb_cma[node], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } } } #endif - return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + return page ? page_folio(page) : NULL; } #else /* !CONFIG_CONTIG_ALLOC */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1400,40 +1570,41 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } -static inline void free_gigantic_page(struct page *page, unsigned int order) { } -static inline void destroy_compound_gigantic_page(struct page *page, +static inline void free_gigantic_folio(struct folio *folio, + unsigned int order) { } +static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif /* - * Remove hugetlb page from lists, and update dtor so that page appears + * Remove hugetlb folio from lists, and update dtor so that the folio appears * as just a compound page. * - * A reference is held on the page, except in the case of demote. + * A reference is held on the folio, except in the case of demote. * * Must be called with hugetlb lock held. */ -static void __remove_hugetlb_page(struct hstate *h, struct page *page, +static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus, bool demote) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - list_del(&page->lru); + list_del(&folio->lru); - if (HPageFreed(page)) { + if (folio_test_hugetlb_freed(folio)) { h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1452,50 +1623,50 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, * * For gigantic pages set the destructor to the null dtor. This * destructor will never be called. Before freeing the gigantic - * page destroy_compound_gigantic_page will turn the compound page - * into a simple group of pages. After this the destructor does not + * page destroy_compound_gigantic_folio will turn the folio into a + * simple group of pages. After this the destructor does not * apply. * * This handles the case where more than one ref is held when and - * after update_and_free_page is called. + * after update_and_free_hugetlb_folio is called. * * In the case of demote we do not ref count the page as it will soon * be turned into a page of smaller size. */ if (!demote) - set_page_refcounted(page); + folio_ref_unfreeze(folio, 1); if (hstate_is_gigantic(h)) - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR); else - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_page(struct hstate *h, struct page *page, +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, false); + __remove_hugetlb_folio(h, folio, adjust_surplus, false); } -static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, +static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, true); + __remove_hugetlb_folio(h, folio, adjust_surplus, true); } -static void add_hugetlb_page(struct hstate *h, struct page *page, +static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int zeroed; - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; @@ -1504,17 +1675,21 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]++; } - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_page_private(page, 0); - SetHPageVmemmapOptimized(page); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_change_private(folio, NULL); + /* + * We have to set hugetlb_vmemmap_optimized again as above + * folio_change_private(folio, NULL) cleared it. + */ + folio_set_hugetlb_vmemmap_optimized(folio); /* - * This page is about to be managed by the hugetlb allocator and + * This folio is about to be managed by the hugetlb allocator and * should have no users. Drop our reference, and check for others * just in case. */ - zeroed = put_page_testzero(page); - if (!zeroed) + zeroed = folio_put_testzero(folio); + if (unlikely(!zeroed)) /* * It is VERY unlikely soneone else has taken a ref on * the page. In this case, we simply return as the @@ -1523,14 +1698,15 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, */ return; - arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + arch_clear_hugepage_flags(&folio->page); + enqueue_hugetlb_folio(h, folio); } static void __update_and_free_page(struct hstate *h, struct page *page) { int i; - struct page *subpage = page; + struct folio *folio = page_folio(page); + struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1539,7 +1715,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ - if (HPageRawHwpUnreliable(page)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; if (hugetlb_vmemmap_restore(h, page)) { @@ -1549,7 +1725,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * page and put the page back on the hugetlb free list and treat * as a surplus page. */ - add_hugetlb_page(h, page, true); + add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } @@ -1558,11 +1734,11 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ - if (unlikely(PageHWPoison(page))) - hugetlb_clear_page_hwpoison(page); + if (unlikely(folio_test_hwpoison(folio))) + hugetlb_clear_page_hwpoison(&folio->page); - for (i = 0; i < pages_per_huge_page(h); - i++, subpage = mem_map_next(subpage, page, i)) { + for (i = 0; i < pages_per_huge_page(h); i++) { + subpage = folio_page(folio, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1571,19 +1747,19 @@ static void __update_and_free_page(struct hstate *h, struct page *page) /* * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_page. + * need to be given back to CMA in free_gigantic_folio. */ if (hstate_is_gigantic(h) || - hugetlb_cma_page(page, huge_page_order(h))) { - destroy_compound_gigantic_page(page, huge_page_order(h)); - free_gigantic_page(page, huge_page_order(h)); + hugetlb_cma_folio(folio, huge_page_order(h))) { + destroy_compound_gigantic_folio(folio, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } else { __free_pages(page, huge_page_order(h)); } } /* - * As update_and_free_page() can be called under any context, so we cannot + * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. @@ -1612,8 +1788,9 @@ static void free_hpage_workfn(struct work_struct *work) /* * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() * is going to trigger because a previous call to - * remove_hugetlb_page() will set_compound_page_dtor(page, - * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + * remove_hugetlb_folio() will call folio_set_compound_dtor + * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate() + * directly. */ h = size_to_hstate(page_size(page)); @@ -1630,11 +1807,11 @@ static inline void flush_free_hpage_work(struct hstate *h) flush_work(&free_hpage_work); } -static void update_and_free_page(struct hstate *h, struct page *page, +static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { - if (!HPageVmemmapOptimized(page) || !atomic) { - __update_and_free_page(h, page); + if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { + __update_and_free_page(h, &folio->page); return; } @@ -1645,16 +1822,18 @@ static void update_and_free_page(struct hstate *h, struct page *page, * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ - if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct page *page, *t_page; + struct folio *folio; list_for_each_entry_safe(page, t_page, list, lru) { - update_and_free_page(h, page, false); + folio = page_folio(page); + update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } @@ -1676,21 +1855,22 @@ void free_huge_page(struct page *page) * Can't pass hstate in here because it is called from the * compound page destructor. */ - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - struct hugepage_subpool *spool = hugetlb_page_subpool(page); + struct folio *folio = page_folio(page); + struct hstate *h = folio_hstate(folio); + int nid = folio_nid(folio); + struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(page_mapcount(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); - hugetlb_set_page_subpool(page, NULL); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); - page->mapping = NULL; - restore_reserve = HPageRestoreReserve(page); - ClearHPageRestoreReserve(page); + hugetlb_set_folio_subpool(folio, NULL); + if (folio_test_anon(folio)) + __ClearPageAnonExclusive(&folio->page); + folio->mapping = NULL; + restore_reserve = folio_test_hugetlb_restore_reserve(folio); + folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a @@ -1712,26 +1892,26 @@ void free_huge_page(struct page *page) } spin_lock_irqsave(&hugetlb_lock, flags); - ClearHPageMigratable(page); - hugetlb_cgroup_uncharge_page(hstate_index(h), - pages_per_huge_page(h), page); - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + folio_clear_hugetlb_migratable(folio); + hugetlb_cgroup_uncharge_folio(hstate_index(h), + pages_per_huge_page(h), folio); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); if (restore_reserve) h->resv_huge_pages++; - if (HPageTemporary(page)) { - remove_hugetlb_page(h, page, false); + if (folio_test_hugetlb_temporary(folio)) { + remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - remove_hugetlb_page(h, page, true); + remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } @@ -1746,36 +1926,38 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct hstate *h, struct page *page) +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - hugetlb_vmemmap_optimize(h, page); - INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - hugetlb_set_page_subpool(page, NULL); - set_hugetlb_cgroup(page, NULL); - set_hugetlb_cgroup_rsvd(page, NULL); + hugetlb_vmemmap_optimize(h, &folio->page); + INIT_LIST_HEAD(&folio->lru); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + hugetlb_set_folio_subpool(folio, NULL); + set_hugetlb_cgroup(folio, NULL); + set_hugetlb_cgroup_rsvd(folio, NULL); } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { - __prep_new_huge_page(h, page); + __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, - bool demote) +static bool __prep_compound_gigantic_folio(struct folio *folio, + unsigned int order, bool demote) { int i, j; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; + + __folio_clear_reserved(folio); + __folio_set_head(folio); + /* we rely on prep_new_hugetlb_folio to set the destructor */ + folio_set_compound_order(folio, order); + for (i = 0; i < nr_pages; i++) { + p = folio_page(folio, i); - /* we rely on prep_new_huge_page to set the destructor */ - set_compound_order(page, order); - __ClearPageReserved(page); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -1788,7 +1970,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, * on the head page when they need know if put_page() is needed * after get_user_pages(). */ - __ClearPageReserved(p); + if (i != 0) /* head page cleared above */ + __ClearPageReserved(p); /* * Subtle and very unlikely * @@ -1814,39 +1997,42 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, } else { VM_BUG_ON_PAGE(page_count(p), p); } - set_compound_head(p, page); + if (i != 0) + set_compound_head(p, &folio->page); } - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); return true; out_error: - /* undo tail page modifications made above */ - p = page + 1; - for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { - clear_compound_head(p); + /* undo page modifications made above */ + for (j = 0; j < i; j++) { + p = folio_page(folio, j); + if (j != 0) + clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + for (; j < nr_pages; j++) { + p = folio_page(folio, j); __ClearPageReserved(p); - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + } + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); return false; } -static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool prep_compound_gigantic_folio(struct folio *folio, + unsigned int order) { - return __prep_compound_gigantic_page(page, order, false); + return __prep_compound_gigantic_folio(folio, order, false); } -static bool prep_compound_gigantic_page_for_demote(struct page *page, +static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, unsigned int order) { - return __prep_compound_gigantic_page(page, order, true); + return __prep_compound_gigantic_folio(folio, order, true); } /* @@ -1911,13 +2097,14 @@ pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_buddy_huge_page(struct hstate *h, +static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct page *page; bool alloc_try_hard = true; + bool retry = true; /* * By default we always try hard to allocate the page with @@ -1933,11 +2120,20 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); +retry: page = __alloc_pages(gfp_mask, order, nid, nmask); - if (page) - __count_vm_event(HTLB_BUDDY_PGALLOC); - else - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + /* Freeze head page */ + if (page && !page_ref_freeze(page, 1)) { + __free_pages(page, order); + if (retry) { /* retry once */ + retry = false; + goto retry; + } + /* WOW! twice in a row. */ + pr_warn("HugeTLB head page unexpected inflated ref count\n"); + page = NULL; + } /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this @@ -1955,36 +2151,44 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, if (node_alloc_noretry && !page && alloc_try_hard) node_set(nid, *node_alloc_noretry); - return page; + if (!page) { + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + return NULL; + } + + __count_vm_event(HTLB_BUDDY_PGALLOC); + return page_folio(page); } /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. */ -static struct page *alloc_fresh_huge_page(struct hstate *h, +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; bool retry = false; retry: if (hstate_is_gigantic(h)) - page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else - page = alloc_buddy_huge_page(h, gfp_mask, + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); - if (!page) + if (!folio) return NULL; - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_page(page, huge_page_order(h))) { + if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* * Rare failure to convert pages to compound page. * Free pages and try again - ONCE! */ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); if (!retry) { retry = true; goto retry; @@ -1992,9 +2196,9 @@ retry: return NULL; } } - prep_new_huge_page(h, page, page_to_nid(page)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - return page; + return folio; } /* @@ -2004,23 +2208,20 @@ retry: static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, - node_alloc_noretry); - if (page) - break; + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + nodes_allowed, node_alloc_noretry); + if (folio) { + free_huge_page(&folio->page); /* free it into the hugepage allocator */ + return 1; + } } - if (!page) - return 0; - - put_page(page); /* free it into the hugepage allocator */ - - return 1; + return 0; } /* @@ -2036,6 +2237,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, { int nr_nodes, node; struct page *page = NULL; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2047,7 +2249,8 @@ static struct page *remove_pool_huge_page(struct hstate *h, !list_empty(&h->hugepage_freelists[node])) { page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - remove_hugetlb_page(h, page, acct_surplus); + folio = page_folio(page); + remove_hugetlb_folio(h, folio, acct_surplus); break; } } @@ -2072,29 +2275,29 @@ static struct page *remove_pool_huge_page(struct hstate *h, int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; + struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ - if (!PageHuge(page)) + if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); - if (!PageHuge(page)) { + if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } - if (!page_count(page)) { - struct page *head = compound_head(page); - struct hstate *h = page_hstate(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) + if (!folio_ref_count(folio)) { + struct hstate *h = folio_hstate(folio); + if (!available_huge_pages(h)) goto out; /* * We should make sure that the page is already on the free list * when it is dissolved. */ - if (unlikely(!HPageFreed(head))) { + if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); @@ -2109,24 +2312,24 @@ retry: goto retry; } - remove_hugetlb_page(h, head, false); + remove_hugetlb_folio(h, folio, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* - * Normally update_and_free_page will allocate required vmemmmap - * before freeing the page. update_and_free_page will fail to + * Normally update_and_free_hugtlb_folio will allocate required vmemmmap + * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = hugetlb_vmemmap_restore(h, head); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (!rc) { - update_and_free_page(h, head, false); + update_and_free_hugetlb_folio(h, folio, false); } else { spin_lock_irq(&hugetlb_lock); - add_hugetlb_page(h, head, false); + add_hugetlb_folio(h, folio, false); h->max_huge_pages++; spin_unlock_irq(&hugetlb_lock); } @@ -2175,10 +2378,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask, bool zero_ref) + int nid, nodemask_t *nmask) { - struct page *page = NULL; - bool retry = false; + struct folio *folio = NULL; if (hstate_is_gigantic(h)) return NULL; @@ -2188,9 +2390,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); -retry: - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); @@ -2202,64 +2403,42 @@ retry: * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); - put_page(page); + free_huge_page(&folio->page); return NULL; } - if (zero_ref) { - /* - * Caller requires a page with zero ref count. - * We will drop ref count here. If someone else is holding - * a ref, the page will be freed when they drop it. Abuse - * temporary page flag to accomplish this. - */ - SetHPageTemporary(page); - if (!put_page_testzero(page)) { - /* - * Unexpected inflated ref count on freshly allocated - * huge. Retry once. - */ - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - spin_unlock_irq(&hugetlb_lock); - if (retry) - return NULL; - - retry = true; - goto retry; - } - ClearHPageTemporary(page); - } - h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; + h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; + struct folio *folio; if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; + /* fresh huge pages are frozen */ + folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); - return page; + return &folio->page; } /* @@ -2280,14 +2459,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } @@ -2297,7 +2476,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); - if (h->free_huge_pages - h->resv_huge_pages > 0) { + if (available_huge_pages(h)) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); @@ -2336,7 +2515,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { - struct list_head surplus_list; + LIST_HEAD(surplus_list); struct page *page, *tmp; int ret; long i; @@ -2351,14 +2530,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) } allocated = 0; - INIT_LIST_HEAD(&surplus_list); ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL, true); + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; @@ -2402,7 +2580,7 @@ retry: if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, page_folio(page)); } free: spin_unlock_irq(&hugetlb_lock); @@ -2709,73 +2887,52 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, } /* - * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve + * the old one * @h: struct hstate old page belongs to - * @old_page: Old page to dissolve + * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, - struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, + struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - int nid = page_to_nid(old_page); - bool alloc_retry = false; - struct page *new_page; + int nid = folio_nid(old_folio); + struct folio *new_folio; int ret = 0; /* - * Before dissolving the page, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the page and 'prep' it + * Before dissolving the folio, we need to allocate a new one for the + * pool to remain stable. Here, we allocate the folio and 'prep' it * by doing everything but actually updating counters and adding to * the pool. This simplifies and let us do most of the processing * under the lock. */ -alloc_retry: - new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); - if (!new_page) + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); + if (!new_folio) return -ENOMEM; - /* - * If all goes well, this page will be directly added to the free - * list in the pool. For this the ref count needs to be zero. - * Attempt to drop now, and retry once if needed. It is VERY - * unlikely there is another ref on the page. - * - * If someone else has a reference to the page, it will be freed - * when they drop their ref. Abuse temporary page flag to accomplish - * this. Retry once if there is an inflated ref count. - */ - SetHPageTemporary(new_page); - if (!put_page_testzero(new_page)) { - if (alloc_retry) - return -EBUSY; - - alloc_retry = true; - goto alloc_retry; - } - ClearHPageTemporary(new_page); - - __prep_new_huge_page(h, new_page); + __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); - if (!PageHuge(old_page)) { + if (!folio_test_hugetlb(old_folio)) { /* - * Freed from under us. Drop new_page too. + * Freed from under us. Drop new_folio too. */ goto free_new; - } else if (page_count(old_page)) { + } else if (folio_ref_count(old_folio)) { /* - * Someone has grabbed the page, try to isolate it here. + * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_page, list); + ret = isolate_hugetlb(&old_folio->page, list); spin_lock_irq(&hugetlb_lock); goto free_new; - } else if (!HPageFreed(old_page)) { + } else if (!folio_test_hugetlb_freed(old_folio)) { /* - * Page's refcount is 0 but it has not been enqueued in the + * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ @@ -2784,35 +2941,35 @@ retry: goto retry; } else { /* - * Ok, old_page is still a genuine free hugepage. Remove it from + * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() - * and enqueue_huge_page() for new_page. The counters will remain - * stable since this happens under the lock. + * and enqueue_hugetlb_folio() for new_folio. The counters will + * remain stable since this happens under the lock. */ - remove_hugetlb_page(h, old_page, false); + remove_hugetlb_folio(h, old_folio, false); /* - * Ref count on new page is already zero as it was dropped + * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); - enqueue_huge_page(h, new_page); + enqueue_hugetlb_folio(h, new_folio); /* - * Pages have been replaced, we can safely free the old one. + * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, old_page, false); + update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); - /* Page has a zero ref count, but needs a ref to be freed */ - set_page_refcounted(new_page); - update_and_free_page(h, new_page, false); + /* Folio has a zero ref count, but needs a ref to be freed */ + folio_ref_unfreeze(new_folio, 1); + update_and_free_hugetlb_folio(h, new_folio, false); return ret; } @@ -2820,7 +2977,7 @@ free_new: int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; - struct page *head; + struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -2829,9 +2986,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) * Return success when racing as if we dissolved the page ourselves. */ spin_lock_irq(&hugetlb_lock); - if (PageHuge(page)) { - head = compound_head(page); - h = page_hstate(head); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; @@ -2846,10 +3002,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && !isolate_hugetlb(head, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) ret = 0; - else if (!page_count(head)) - ret = alloc_and_dissolve_huge_page(h, head, list); + else if (!folio_ref_count(folio)) + ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); return ret; } @@ -2860,6 +3016,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; + struct folio *folio; long map_chg, map_commit; long gbl_chg; int ret, idx; @@ -2928,14 +3085,16 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); /* Fall through */ } + folio = page_folio(page); hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. @@ -2965,8 +3124,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (deferred_reserve) - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); } return page; @@ -3031,17 +3190,18 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); + struct folio *folio = page_folio(page); struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); - WARN_ON(page_count(page) != 1); - if (prep_compound_gigantic_page(page, huge_page_order(h))) { - WARN_ON(PageReserved(page)); - prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* add to the hugepage allocator */ + WARN_ON(folio_ref_count(folio) != 1); + if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { + WARN_ON(folio_test_reserved(folio)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } /* @@ -3063,14 +3223,14 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (!alloc_bootmem_huge_page(h, nid)) break; } else { - struct page *page; + struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - page = alloc_fresh_huge_page(h, gfp_mask, nid, + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); - if (!page) + if (!folio) break; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(&folio->page); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3215,7 +3375,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, goto out; if (PageHighMem(page)) continue; - remove_hugetlb_page(h, page, false); + remove_hugetlb_folio(h, page_folio(page), false); list_add(&page->lru, &page_list); } } @@ -3420,11 +3580,13 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) { int i, nid = page_to_nid(page); struct hstate *target_hstate; + struct folio *folio = page_folio(page); + struct page *subpage; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - remove_hugetlb_page_for_demote(h, page, false); + remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); rc = hugetlb_vmemmap_restore(h, page); @@ -3432,15 +3594,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); set_page_refcounted(page); - add_hugetlb_page(h, page, false); + add_hugetlb_folio(h, page_folio(page), false); return rc; } /* - * Use destroy_compound_hugetlb_page_for_demote for all huge page + * Use destroy_compound_hugetlb_folio_for_demote for all huge page * sizes as it will not ref count pages. */ - destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. @@ -3453,15 +3615,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { + subpage = nth_page(page, i); + folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_page_for_demote(page + i, + prep_compound_gigantic_folio_for_demote(folio, target_hstate->order); else - prep_compound_page(page + i, target_hstate->order); - set_page_private(page + i, 0); - set_page_refcounted(page + i); - prep_new_huge_page(target_hstate, page + i, nid); - put_page(page + i); + prep_compound_page(subpage, target_hstate->order); + set_page_private(subpage, 0); + prep_new_hugetlb_folio(target_hstate, folio, nid); + free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3472,7 +3635,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) * based on pool changes for the demoted page. */ h->max_huge_pages--; - target_hstate->max_huge_pages += pages_per_huge_page(h); + target_hstate->max_huge_pages += + pages_per_huge_page(h) / pages_per_huge_page(target_hstate); return rc; } @@ -3714,7 +3878,7 @@ static ssize_t demote_store(struct kobject *kobj, unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; - int err = 0; + int err; int nid; err = kstrtoul(buf, 10, &nr_demote); @@ -3765,8 +3929,7 @@ HSTATE_ATTR_WO(demote); static ssize_t demote_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int nid; - struct hstate *h = kobj_to_hstate(kobj, &nid); + struct hstate *h = kobj_to_hstate(kobj, NULL); unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; return sysfs_emit(buf, "%lukB\n", demote_size); @@ -3779,7 +3942,6 @@ static ssize_t demote_size_store(struct kobject *kobj, struct hstate *h, *demote_hstate; unsigned long demote_size; unsigned int demote_order; - int nid; demote_size = (unsigned long)memparse(buf, NULL); @@ -3791,7 +3953,7 @@ static ssize_t demote_size_store(struct kobject *kobj, return -EINVAL; /* demote order must be smaller than hstate order */ - h = kobj_to_hstate(kobj, &nid); + h = kobj_to_hstate(kobj, NULL); if (demote_order >= h->order) return -EINVAL; @@ -3845,35 +4007,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, if (retval) { kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; + return retval; } if (h->demote_order) { - if (sysfs_create_group(hstate_kobjs[hi], - &hstate_demote_attr_group)) + retval = sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group); + if (retval) { pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } } - return retval; -} - -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } + return 0; } #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3929,7 +4082,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3939,10 +4092,15 @@ static void hugetlb_unregister_node(struct node *node) for_each_hstate(h) { int idx = hstate_index(h); - if (nhs->hstate_kobjs[idx]) { - kobject_put(nhs->hstate_kobjs[idx]); - nhs->hstate_kobjs[idx] = NULL; - } + struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; + + if (!hstate_kobj) + continue; + if (h->demote_order) + sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); + sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); + kobject_put(hstate_kobj); + nhs->hstate_kobjs[idx] = NULL; } kobject_put(nhs->hugepages_kobj); @@ -3954,12 +4112,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -3990,18 +4151,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + for_each_online_node(nid) + hugetlb_register_node(node_devices[nid]); } #else /* !CONFIG_NUMA */ @@ -4017,6 +4168,36 @@ static void hugetlb_register_all_nodes(void) { } #endif +#ifdef CONFIG_CMA +static void __init hugetlb_cma_check(void); +#else +static inline __init void hugetlb_cma_check(void) +{ +} +#endif + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -4071,7 +4252,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP @@ -4116,7 +4296,7 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_alloc = first_memory_node; h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", - huge_page_size(h)/1024); + huge_page_size(h)/SZ_1K); parsed_hstate = h; } @@ -4131,11 +4311,11 @@ static void __init hugepages_clear_pages_in_node(void) if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, - MAX_NUMNODES * sizeof(unsigned int)); + sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, - MAX_NUMNODES * sizeof(unsigned int)); + sizeof(parsed_hstate->max_huge_pages_node)); } } @@ -4330,18 +4510,34 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; - nodemask_t *mpol_allowed; + nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); - mpol_allowed = policy_nodemask_current(gfp_mask); - + mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || node_isset(node, *mpol_allowed)) + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } @@ -4570,6 +4766,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); /* + * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA @@ -4581,16 +4778,38 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } + + /* + * vma_lock structure for sharable mappings is vma specific. + * Clear old pointer (if copied via vm_area_dup) and allocate + * new structure. Before clearing, make sure vma_lock is not + * for this vma. + */ + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock) { + if (vma_lock->vma != vma) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } else + pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); + } else + hugetlb_vma_lock_alloc(vma); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *resv = vma_resv_map(vma); + struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + hugetlb_vma_lock_free(vma); + + resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -4713,7 +4932,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr hugepage_add_new_anon_rmap(new_page, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); } @@ -4721,14 +4939,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - pte_t *src_pte, *dst_pte, entry, dst_entry; + pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; @@ -4742,12 +4959,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, raw_write_seqcount_begin(&src->write_protect_seq); } else { /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. + * For shared mappings the vma lock must be held before + * calling huge_pte_offset in the src vma. Otherwise, the + * returned ptep could go away if part of a shared pmd and + * another thread calls huge_pmd_unshare. */ - i_mmap_lock_read(mapping); + hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4766,15 +4983,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, /* * If the pagetables are shared don't copy or take references. - * dst_pte == src_pte is the common case of src/dest sharing. * + * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with - * another vma. If dst_pte !none, this implies sharing. - * Check here before taking page table lock, and once again - * after taking the lock below. + * another vma. So page_count of ptep page is checked instead + * to reliably determine whether pte is shared. */ - dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + if (page_count(virt_to_page(dst_pte)) > 1) { addr |= last_addr_mask; continue; } @@ -4783,13 +4998,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - dst_entry = huge_ptep_get(dst_pte); again: - if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + if (huge_pte_none(entry)) { /* - * Skip if src entry none. Also, skip in the - * unlikely case dst entry !none as this implies - * sharing with another vma. + * Skip if src entry none. */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { @@ -4868,7 +5080,7 @@ again: restore_reserve_on_error(h, dst_vma, addr, new); put_page(new); - /* dst_entry won't change as in child */ + /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_page(dst_vma, dst_pte, addr, new); @@ -4900,7 +5112,7 @@ again: raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { - i_mmap_unlock_read(mapping); + hugetlb_vma_unlock_read(src_vma); } return ret; @@ -4959,6 +5171,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ + hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); @@ -4990,6 +5203,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); + hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } @@ -5006,7 +5220,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - struct mmu_notifier_range range; unsigned long last_addr_mask; bool force_flush = false; @@ -5021,13 +5234,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); - /* - * If sharing possible, alert mmu notifiers of worst case. - */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, - end); - adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); - mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { @@ -5112,7 +5318,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct if (ref_page) break; } - mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); /* @@ -5137,29 +5342,46 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* mmu notification performed in caller */ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); - /* - * Clear this flag so that x86's huge_pmd_share page_table_shareable - * test will fail on a vma being torn down, and not grab a page table - * on its way out. We're lucky that the flag has such an appropriate - * name, and can in fact be safely cleared here. We could clear it - * before the __unmap_hugepage_range above, but all that's necessary - * is to clear it before releasing the i_mmap_rwsem. This works - * because in the context this is called, the VMA is about to be - * destroyed and the i_mmap_rwsem is held. - */ - vma->vm_flags &= ~VM_MAYSHARE; + if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ + /* + * Unlock and free the vma lock before releasing i_mmap_rwsem. + * When the vma_lock is freed, this makes the vma ineligible + * for pmd sharing. And, i_mmap_rwsem is required to set up + * pmd sharing. This is important as page tables for this + * unmapped range will be asynchrously deleted. If the page + * tables are shared, there will be issues when accessed by + * someone else. + */ + __hugetlb_vma_unlock_write_free(vma); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } else { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + struct mmu_notifier_range range; struct mmu_gather tlb; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + + mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -5238,9 +5460,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; - VM_BUG_ON(unshare && (flags & FOLL_WRITE)); - VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); - /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. @@ -5250,8 +5469,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - if (unlikely(unshare)) - return 0; set_huge_ptep_writable(vma, haddr, ptep); return 0; } @@ -5314,11 +5531,10 @@ retry_avoidcopy: u32 hash; put_page(old_page); - BUG_ON(huge_pte_none(pte)); /* - * Drop hugetlb_fault_mutex and i_mmap_rwsem before - * unmapping. unmapping needs to hold i_mmap_rwsem - * in write mode. Dropping i_mmap_rwsem in read mode + * Drop hugetlb_fault_mutex and vma_lock before + * unmapping. unmapping needs to hold vma_lock + * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * @@ -5326,13 +5542,13 @@ retry_avoidcopy: */ idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); unmap_ref_private(mm, vma, old_page, haddr); - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5374,8 +5590,6 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearHPageRestoreReserve(new_page); - /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -5406,19 +5620,6 @@ out_release_old: return ret; } -/* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) -{ - struct address_space *mapping; - pgoff_t idx; - - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - - return find_lock_page(mapping, idx); -} - /* * Return whether there is a pagecache page to back given address within VMA. * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. @@ -5439,7 +5640,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx) { struct folio *folio = page_folio(page); @@ -5476,7 +5677,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, unsigned long addr, unsigned long reason) { - vm_fault_t ret; u32 hash; struct vm_fault vmf = { .vma = vma, @@ -5494,18 +5694,31 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * vma_lock and hugetlb_fault_mutex must be dropped before handling + * userfault. Also mmap_lock could be dropped due to handling + * userfault, any vma operation should be careful from here. */ + hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, reason); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, reason); +} - return ret; +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. + */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; } static vm_fault_t hugetlb_no_page(struct mm_struct *mm, @@ -5523,6 +5736,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* * Currently, we are forced to kill the process in the event the @@ -5533,28 +5747,46 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); - return ret; + goto out; } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * i_size is modified when holding i_mmap_rwsem, so check here - * once for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - -retry: new_page = false; page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MISSING); - goto out; + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or during-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing). The latter can + * confuse the userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. + */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); } page = alloc_huge_page(vma, haddr, 0); @@ -5571,11 +5803,10 @@ retry: * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - ptl = huge_pte_lock(h, mm, ptep); - ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(page)); - spin_unlock(ptl); + else + ret = 0; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -5583,11 +5814,17 @@ retry: new_page = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = huge_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(page, mapping, idx); if (err) { + /* + * err can't be -EEXIST which implies someone + * else consumed the reservation since hugetlb + * fault mutex is held when add a hugetlb page + * to the page cache. So it's safe to call + * restore_reserve_on_error() here. + */ + restore_reserve_on_error(h, vma, haddr, page); put_page(page); - if (err == -EEXIST) - goto retry; goto out; } new_pagecache_page = true; @@ -5615,10 +5852,14 @@ retry: if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MINOR); - goto out; + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); } } @@ -5643,10 +5884,9 @@ retry: if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; - if (anon_rmap) { - ClearHPageRestoreReserve(page); + if (anon_rmap) hugepage_add_new_anon_rmap(page, vma, haddr); - } else + else page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -5676,15 +5916,17 @@ retry: unlock_page(page); out: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(ptl); backout_unlocked: - unlock_page(page); - /* restore reserve for newly allocated pages not in page cache */ if (new_page && !new_pagecache_page) restore_reserve_on_error(h, vma, haddr, page); + + unlock_page(page); put_page(page); goto out; } @@ -5745,40 +5987,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with i_size modifications during truncation. + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* + * Acquire vma lock before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. * * ptep could have already be assigned via huge_pte_offset. That * is OK, as huge_pte_alloc will return the same value unless * something has changed. */ - mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); + hugetlb_vma_lock_read(vma); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { - i_mmap_unlock_read(mapping); + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - /* - * Serialize hugepage allocation and instantiation, so that we don't - * get spurious allocation failures if two CPUs race to instantiate - * the same page in the page cache. - */ - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + if (huge_pte_none_mostly(entry)) + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which make us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); - goto out_mutex; - } ret = 0; @@ -5808,7 +6051,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr); + pagecache_page = find_lock_page(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -5832,8 +6075,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(pagecache_page); put_page(pagecache_page); } + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5876,8 +6119,8 @@ out_ptl: put_page(pagecache_page); } out_mutex: + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -6005,48 +6248,35 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* * Serialization between remove_inode_hugepages() and - * huge_add_to_page_cache() below happens through the + * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = huge_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; page_in_pagecache = true; } - ptl = huge_pte_lockptr(h, dst_mm, dst_pte); - spin_lock(ptl); + ptl = huge_pte_lock(h, dst_mm, dst_pte); - /* - * Recheck the i_size after holding PT lock to make sure not - * to leave any page mapped (as page_mapped()) beyond the end - * of the i_size (remove_inode_hugepages() is strict about - * enforcing that). If we bail out here, we'll also leave a - * page in the radix tree in the vm_shared case beyond the end - * of the i_size, but remove_inode_hugepages() will take care - * of it as soon as we drop the hugetlb_fault_mutex_table. - */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - ret = -EFAULT; - if (idx >= size) + ret = -EIO; + if (PageHWPoison(page)) goto out_release_unlock; - ret = -EEXIST; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ + ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) { + if (page_in_pagecache) page_dup_file_rmap(page, true); - } else { - ClearHPageRestoreReserve(page); + else hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); - } /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6105,13 +6335,14 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, for (nr = 0; nr < refs; nr++) { if (likely(pages)) - pages[nr] = mem_map_offset(page, nr); + pages[nr] = nth_page(page, nr); if (vmas) vmas[nr] = vma; } } -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, +static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, + unsigned int flags, pte_t *pte, bool *unshare) { pte_t pteval = huge_ptep_get(pte); @@ -6123,13 +6354,69 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; if (flags & FOLL_WRITE) return true; - if (gup_must_unshare(flags, pte_page(pteval))) { + if (gup_must_unshare(vma, flags, pte_page(pteval))) { *unshare = true; return true; } return false; } +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & huge_page_mask(h); + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte, entry; + + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) + return NULL; + +retry: + pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (!pte) + return NULL; + + ptl = huge_pte_lock(h, mm, pte); + entry = huge_ptep_get(pte); + if (pte_present(entry)) { + page = pte_page(entry) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* + * Note that page may be a sub-page, and with vmemmap + * optimizations the page struct may be read only. + * try_grab_page() will increase the ref count on the + * head page, so this will be OK. + * + * try_grab_page() should always be able to get the page here, + * because we hold the ptl lock and have verified pte_present(). + */ + if (try_grab_page(page, flags)) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(entry)) { + spin_unlock(ptl); + __migration_entry_wait_huge(pte, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -6196,7 +6483,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * directly from any kind of swap entries. */ if (absent || - __follow_hugetlb_must_fault(flags, pte, &unshare)) { + __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; @@ -6206,9 +6493,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; else if (unshare) fault_flags |= FAULT_FLAG_UNSHARE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; @@ -6269,7 +6559,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); if (pages || vmas) - record_subpages_vmas(mem_map_offset(page, pfn_offset), + record_subpages_vmas(nth_page(page, pfn_offset), vma, refs, likely(pages) ? pages + i : NULL, vmas ? vmas + i : NULL); @@ -6282,8 +6572,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * tables. If the huge page is present, then the tail * pages must also be present. The ptl prevents the * head page and tail pages from being rearranged in - * any way. So this page must be available at this - * point, unless the page refcount overflowed: + * any way. As this is hugetlb, the pages will never + * be p2pdma or not longterm pinable. So this page + * must be available at this point, unless the page + * refcount overflowed: */ if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, flags))) { @@ -6340,8 +6632,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); - last_addr_mask = hugetlb_mask_last_page(h); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); @@ -6440,6 +6733,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages << h->order; @@ -6465,6 +6759,12 @@ bool hugetlb_reserve_pages(struct inode *inode, } /* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ + hugetlb_vma_lock_alloc(vma); + + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves @@ -6487,12 +6787,11 @@ bool hugetlb_reserve_pages(struct inode *inode, resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, ®ions_needed); - } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) - return false; + goto out_err; chg = to - from; @@ -6587,6 +6886,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: + hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. @@ -6656,35 +6956,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, /* * match the virtual addresses, permission and the alignment of the * page table page. + * + * Also, vma_lock (vm_private_data) is required for sharing. */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || - !range_in_vma(svma, sbase, s_end)) + !range_in_vma(svma, sbase, s_end) || + !svma->vm_private_data) return 0; return saddr; } -static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) -{ - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - - /* - * check on proper vm_flags and page table alignment - */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) - return true; - return false; -} - bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif - return vma_shareable(vma, addr); + /* + * check on proper vm_flags and page table alignment + */ + if (!(vma->vm_flags & VM_MAYSHARE)) + return false; + if (!vma->vm_private_data) /* vma lock required for sharing */ + return false; + if (!range_in_vma(vma, start, end)) + return false; + return true; } /* @@ -6718,12 +7020,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode if - * sharing is possible. For hugetlbfs, this prevents removal of any page - * table entries associated with the address space. This is important as we - * are setting up sharing based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -6737,7 +7037,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - i_mmap_assert_locked(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -6767,6 +7067,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_read(mapping); return pte; } @@ -6777,7 +7078,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user @@ -6790,6 +7091,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; @@ -6801,6 +7103,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { @@ -6928,123 +7231,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -struct page * __weak -follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -struct page * __weak -follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, int pdshift) -{ - WARN(1, "hugepd follow called with no support for hugepage directory format\n"); - return NULL; -} - -struct page * __weak -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). - */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = pmd_lockptr(mm, pmd); - spin_lock(ptl); - /* - * make sure that the address range covered by this pmd is not - * unmapped from other threads. - */ - if (!pmd_huge(*pmd)) - goto out; - pte = huge_ptep_get((pte_t *)pmd); - if (pte_present(pte)) { - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); - /* - * try_grab_page() should always succeed here, because: a) we - * hold the pmd (ptl) lock, and b) we've just checked that the - * huge pmd (head) page is present in the page tables. The ptl - * prevents the head page and tail pages from being rearranged - * in any way. So this page must be available at this point, - * unless the page refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait_huge((pte_t *)pmd, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); - if (!pud_huge(*pud)) - goto out; - pte = huge_ptep_get((pte_t *)pud); - if (pte_present(pte)) { - page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pud, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) -{ - if (flags & (FOLL_GET | FOLL_PIN)) - return NULL; - - return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); -} - int isolate_hugetlb(struct page *page, struct list_head *list) { int ret = 0; @@ -7063,7 +7249,7 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { int ret = 0; @@ -7073,7 +7259,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page)) ret = 0; - else if (HPageMigratable(page)) + else if (HPageMigratable(page) || unpoison) ret = get_page_unless_zero(page); else ret = -EBUSY; @@ -7082,12 +7268,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } -int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, flags); + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } @@ -7101,15 +7288,15 @@ void putback_active_hugepage(struct page *page) put_page(page); } -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { - struct hstate *h = page_hstate(oldpage); + struct hstate *h = folio_hstate(old_folio); - hugetlb_cgroup_migrate(oldpage, newpage); - set_page_owner_migrate_reason(newpage, reason); + hugetlb_cgroup_migrate(old_folio, new_folio); + set_page_owner_migrate_reason(&new_folio->page, reason); /* - * transfer temporary state of the new huge page. This is + * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. @@ -7118,12 +7305,13 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. */ - if (HPageTemporary(newpage)) { - int old_nid = page_to_nid(oldpage); - int new_nid = page_to_nid(newpage); + if (folio_test_hugetlb_temporary(new_folio)) { + int old_nid = folio_nid(old_folio); + int new_nid = folio_nid(new_folio); + + folio_set_hugetlb_temporary(old_folio); + folio_clear_hugetlb_temporary(new_folio); - SetHPageTemporary(oldpage); - ClearHPageTemporary(newpage); /* * There is no need to transfer the per-node surplus state @@ -7171,6 +7359,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = huge_pte_offset(mm, address, sz); @@ -7182,6 +7371,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/mm/mmu_notifier.rst. @@ -7332,7 +7522,7 @@ void __init hugetlb_cma_reserve(int order) hugetlb_cma_size = 0; } -void __init hugetlb_cma_check(void) +static void __init hugetlb_cma_check(void) { if (!hugetlb_cma_size || cma_reserve_called) return; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index c86691c431fd..d9e4425d81ac 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -75,11 +75,11 @@ parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { - int idx; + struct hstate *h; - for (idx = 0; idx < hugetlb_max_hstate; idx++) { + for_each_hstate(h) { if (page_counter_read( - hugetlb_cgroup_counter_from_cgroup(h_cg, idx))) + hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; @@ -154,9 +154,9 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) * function. */ for_each_node(node) { - /* Set node_to_alloc to -1 for offline nodes. */ + /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = - node_state(node, N_NORMAL_MEMORY) ? node : -1; + node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); @@ -191,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); + struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_page(page); + page_hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely @@ -211,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); - set_hugetlb_cgroup(page, parent); + set_hugetlb_cgroup(folio, parent); out: return; } @@ -225,17 +226,14 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; - int idx; do { - idx = 0; for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(idx, h_cg, page); + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); spin_unlock_irq(&hugetlb_lock); - idx++; } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); @@ -312,21 +310,21 @@ int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page, bool rsvd) + struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; - __set_hugetlb_cgroup(page, h_cg, rsvd); + __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } @@ -335,31 +333,35 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false); + struct folio *folio = page_folio(page); + + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true); + struct folio *folio = page_folio(page); + + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ -static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page, bool rsvd) +static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); - h_cg = __hugetlb_cgroup_from_page(page, rsvd); + h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; - __set_hugetlb_cgroup(page, NULL, rsvd); + __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), @@ -369,27 +371,27 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, css_put(&h_cg->css); else { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } -void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } -void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, @@ -442,7 +444,7 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; - if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 && + if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); @@ -675,12 +677,12 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, static char *mem_fmt(char *buf, int size, unsigned long hsize) { - if (hsize >= (1UL << 30)) - snprintf(buf, size, "%luGB", hsize >> 30); - else if (hsize >= (1UL << 20)) - snprintf(buf, size, "%luMB", hsize >> 20); + if (hsize >= SZ_1G) + snprintf(buf, size, "%luGB", hsize / SZ_1G); + else if (hsize >= SZ_1M) + snprintf(buf, size, "%luMB", hsize / SZ_1M); else - snprintf(buf, size, "%luKB", hsize >> 10); + snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } @@ -886,25 +888,25 @@ void __init hugetlb_cgroup_file_init(void) * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ -void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; - struct hstate *h = page_hstate(oldhpage); + struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); - h_cg = hugetlb_cgroup_from_page(oldhpage); - h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); - set_hugetlb_cgroup(oldhpage, NULL); - set_hugetlb_cgroup_rsvd(oldhpage, NULL); + h_cg = hugetlb_cgroup_from_folio(old_folio); + h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); + set_hugetlb_cgroup(old_folio, NULL); + set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ - set_hugetlb_cgroup(newhpage, h_cg); - set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); - list_move(&newhpage->lru, &h->hugepage_activelist); + set_hugetlb_cgroup(new_folio, h_cg); + set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); + list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 20f414c0379f..45e93a545dd7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> +#include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -202,12 +203,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - /* - * We only change the mapping of the vmemmap virtual address range - * [@start + PAGE_SIZE, end), so we only need to flush the TLB which - * belongs to the range. - */ - flush_tlb_kernel_range(start + PAGE_SIZE, end); + flush_tlb_kernel_range(start, end); return 0; } @@ -231,10 +227,8 @@ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); - } } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, @@ -245,9 +239,23 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - pte_t entry = mk_pte(walk->reuse_page, pgprot); struct page *page = pte_page(*pte); + pte_t entry; + + /* Remapping the head page requires r/w */ + if (unlikely(addr == walk->reuse_addr)) { + pgprot = PAGE_KERNEL; + list_del(&walk->reuse_page->lru); + + /* + * Makes sure that preceding stores to the page contents from + * vmemmap_remap_free() become visible before the set_pte_at() + * write. + */ + smp_wmb(); + } + entry = mk_pte(walk->reuse_page, pgprot); list_add_tail(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } @@ -265,11 +273,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, static inline void reset_struct_pages(struct page *start) { - int i; struct page *from = start + NR_RESET_STRUCT_PAGE; - for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) - memcpy(start + i, from, sizeof(*from)); + BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); + memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, @@ -287,6 +294,11 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); + /* + * Makes sure that preceding stores to the page contents become visible + * before the set_pte_at() write. + */ + smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } @@ -312,6 +324,24 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, }; + int nid = page_to_nid((struct page *)start); + gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY | + __GFP_NOWARN; + + /* + * Allocate a new head vmemmap page to avoid breaking a contiguous + * block of struct page memory when freeing it back to page allocator + * in free_vmemmap_page_list(). This will allow the likely contiguous + * struct page backing memory to be kept contiguous and allowing for + * more allocations of hugepages. Fallback to the currently + * mapped head page in case should it fail to allocate. + */ + walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); + if (walk.reuse_page) { + copy_page(page_to_virt(walk.reuse_page), + (void *)walk.reuse_addr); + list_add(&walk.reuse_page->lru, &vmemmap_pages); + } /* * In order to make remapping routine most efficient for the huge pages, diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 65e242b5a432..d0548e382b6b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -63,13 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); -static void pfn_inject_exit(void) +static void __exit pfn_inject_exit(void) { hwpoison_filter_enable = 0; debugfs_remove_recursive(hwpoison_dir); } -static int pfn_inject_init(void) +static int __init pfn_inject_init(void) { hwpoison_dir = debugfs_create_dir("hwpoison", NULL); diff --git a/mm/init-mm.c b/mm/init-mm.c index fbe7844d0912..c9327abb771c 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm_types.h> -#include <linux/rbtree.h> +#include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/list.h> @@ -28,7 +28,7 @@ * and size this cpu_bitmask to NR_CPUS. */ struct mm_struct init_mm = { - .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/mm/internal.h b/mm/internal.h index 785409805ed7..bcf75a8b032d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -83,9 +83,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); +void folio_activate(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; @@ -104,9 +106,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, nr_to_read); } -unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, +unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, +unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); @@ -187,7 +189,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason /* * in mm/rmap.c: */ -extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c @@ -365,7 +367,6 @@ extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); -extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); @@ -479,9 +480,6 @@ static inline bool is_data_mapping(vm_flags_t flags) } /* mm/util.c */ -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev); -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU @@ -639,34 +637,6 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end) } #endif /* !CONFIG_MMU */ -/* - * Return the mem_map entry representing the 'offset' subpage within - * the maximally aligned gigantic page 'base'. Handle any discontiguity - * in the mem_map at MAX_ORDER_NR_PAGES boundaries. - */ -static inline struct page *mem_map_offset(struct page *base, int offset) -{ - if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return nth_page(base, offset); - return base + offset; -} - -/* - * Iterator over all subpages within the maximally aligned gigantic - * page 'base'. Handle any discontiguity in the mem_map. - */ -static inline struct page *mem_map_next(struct page *iter, - struct page *base, int offset) -{ - if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { - unsigned long pfn = page_to_pfn(base) + offset; - if (!pfn_valid(pfn)) - return NULL; - return pfn_to_page(pfn); - } - return iter + 1; -} - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, @@ -738,14 +708,6 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -#ifdef CONFIG_MEMORY_FAILURE -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); -#else -static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ -} -#endif - extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -847,8 +809,14 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, } #endif +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + void vunmap_range_noflush(unsigned long start, unsigned long end); +void __vunmap_range_noflush(unsigned long start, unsigned long end); + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); @@ -860,8 +828,6 @@ int migrate_device_coherent_page(struct page *page); */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); - extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1f84df9c302e..d4837bff3b60 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla) + +CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) +CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) + obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o + +obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o +obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 69f583855c8b..833bf2cfd2a3 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,13 +30,20 @@ #include "kasan.h" #include "../slab.h" +struct slab *kasan_addr_to_slab(const void *addr) +{ + if (virt_addr_valid(addr)) + return virt_to_slab(addr); + return NULL; +} + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) @@ -88,17 +95,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* - * Only allow cache merging when stack collection is disabled and no metadata - * is present. - */ -slab_flags_t __kasan_never_merge(void) -{ - if (kasan_stack_collection_enabled()) - return SLAB_KASAN; - return 0; -} - void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; @@ -121,132 +117,11 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -/* - * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. - * For larger allocations larger redzones are used. - */ -static inline unsigned int optimal_redzone(unsigned int object_size) -{ - return - object_size <= 64 - 16 ? 16 : - object_size <= 128 - 32 ? 32 : - object_size <= 512 - 64 ? 64 : - object_size <= 4096 - 128 ? 128 : - object_size <= (1 << 14) - 256 ? 256 : - object_size <= (1 << 15) - 512 ? 512 : - object_size <= (1 << 16) - 1024 ? 1024 : 2048; -} - -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) -{ - unsigned int ok_size; - unsigned int optimal_size; - - /* - * SLAB_KASAN is used to mark caches as ones that are sanitized by - * KASAN. Currently this flag is used in two places: - * 1. In slab_ksize() when calculating the size of the accessible - * memory within the object. - * 2. In slab_common.c to prevent merging of sanitized caches. - */ - *flags |= SLAB_KASAN; - - if (!kasan_stack_collection_enabled()) - return; - - ok_size = *size; - - /* Add alloc meta into redzone. */ - cache->kasan_info.alloc_meta_offset = *size; - *size += sizeof(struct kasan_alloc_meta); - - /* - * If alloc meta doesn't fit, don't add it. - * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal - * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for - * larger sizes. - */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.alloc_meta_offset = 0; - *size = ok_size; - /* Continue, since free meta might still fit. */ - } - - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - - /* - * Add free meta into redzone when it's not possible to store - * it in the object. This is the case when: - * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can - * be touched after it was freed, or - * 2. Object has a constructor, which means it's expected to - * retain its content until the next allocation, or - * 3. Object is too small. - * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. - */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { - ok_size = *size; - - cache->kasan_info.free_meta_offset = *size; - *size += sizeof(struct kasan_free_meta); - - /* If free meta doesn't fit, don't add it. */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - *size = ok_size; - } - } - - /* Calculate size with optimal redzone. */ - optimal_size = cache->object_size + optimal_redzone(cache->object_size); - /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ - if (optimal_size > KMALLOC_MAX_SIZE) - optimal_size = KMALLOC_MAX_SIZE; - /* Use optimal size if the size with added metas is not large enough. */ - if (*size < optimal_size) - *size = optimal_size; -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache) { cache->kasan_info.is_kmalloc = true; } -size_t __kasan_metadata_size(struct kmem_cache *cache) -{ - if (!kasan_stack_collection_enabled()) - return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? - sizeof(struct kasan_free_meta) : 0); -} - -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object) -{ - if (!cache->kasan_info.alloc_meta_offset) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; -} - -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; -} -#endif - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); @@ -312,13 +187,9 @@ static inline u8 assign_tag(struct kmem_cache *cache, void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - if (kasan_stack_collection_enabled()) { - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); - } + /* Initialize per-object metadata if it is present. */ + if (kasan_requires_meta()) + kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ object = set_tag(object, assign_tag(cache, object, true)); @@ -329,13 +200,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine, bool init) { - u8 tag; void *tagged_object; if (!kasan_arch_is_ready()) return false; - tag = get_tag(object); tagged_object = object; object = kasan_reset_tag(object); @@ -364,7 +233,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_set_free_info(cache, object, tag); + kasan_save_free_info(cache, tagged_object); return kasan_quarantine_put(cache, object); } @@ -423,20 +292,6 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void set_alloc_info(struct kmem_cache *cache, void *object, - gfp_t flags, bool is_kmalloc) -{ - struct kasan_alloc_meta *alloc_meta; - - /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */ - if (cache->kasan_info.is_kmalloc && !is_kmalloc) - return; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -466,8 +321,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, false); + if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; } @@ -512,8 +367,8 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * Save alloc info (if possible) for kmalloc() allocations. * This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, true); + if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 437fcc7e77cf..b076f597a378 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,146 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* Only allow cache merging when no per-object metadata is present. */ +slab_flags_t kasan_never_merge(void) +{ + if (!kasan_requires_meta()) + return 0; + return SLAB_KASAN; +} + +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static inline unsigned int optimal_redzone(unsigned int object_size) +{ + return + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; +} + +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) +{ + unsigned int ok_size; + unsigned int optimal_size; + + if (!kasan_requires_meta()) + return; + + /* + * SLAB_KASAN is used to mark caches that are sanitized by KASAN + * and that thus have per-object metadata. + * Currently this flag is used in two places: + * 1. In slab_ksize() to account for per-object metadata when + * calculating the size of the accessible memory within the object. + * 2. In slab_common.c via kasan_never_merge() to prevent merging of + * caches with per-object metadata. + */ + *flags |= SLAB_KASAN; + + ok_size = *size; + + /* Add alloc meta into redzone. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. + */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.alloc_meta_offset = 0; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. + * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; +} + +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) +{ + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return (void *)object + cache->kasan_info.free_meta_offset; +} + +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + +size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) +{ + struct kasan_cache *info = &cache->kasan_info; + + if (!kasan_requires_meta()) + return 0; + + if (in_object) + return (info->free_meta_offset ? + 0 : sizeof(struct kasan_free_meta)); + else + return (info->alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((info->free_meta_offset && + info->free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); @@ -358,8 +498,16 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } -void kasan_set_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; @@ -371,12 +519,3 @@ void kasan_set_free_info(struct kmem_cache *cache, /* The object was freed and has free track set. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. */ - return &kasan_get_free_meta(cache, object)->free_track; -} diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9ad8eff71b28..b22c4f461cb0 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -38,16 +38,9 @@ enum kasan_arg_vmalloc { KASAN_ARG_VMALLOC_ON, }; -enum kasan_arg_stacktrace { - KASAN_ARG_STACKTRACE_DEFAULT, - KASAN_ARG_STACKTRACE_OFF, - KASAN_ARG_STACKTRACE_ON, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* * Whether KASAN is enabled at all. @@ -66,9 +59,6 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -/* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,23 +112,6 @@ static int __init early_kasan_flag_vmalloc(char *arg) } early_param("kasan.vmalloc", early_kasan_flag_vmalloc); -/* kasan.stacktrace=off/on */ -static int __init early_kasan_flag_stacktrace(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "off")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; - else if (!strcmp(arg, "on")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; - else - return -EINVAL; - - return 0; -} -early_param("kasan.stacktrace", early_kasan_flag_stacktrace); - static inline const char *kasan_mode_info(void) { if (kasan_mode == KASAN_MODE_ASYNC) @@ -213,17 +186,7 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_stacktrace) { - case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default is specified by kasan_flag_stacktrace definition. */ - break; - case KASAN_ARG_STACKTRACE_OFF: - static_branch_disable(&kasan_flag_stacktrace); - break; - case KASAN_ARG_STACKTRACE_ON: - static_branch_enable(&kasan_flag_stacktrace); - break; - } + kasan_init_tags(); /* KASAN is now initialized, enable it. */ static_branch_enable(&kasan_flag_enabled); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 01c03e45acd4..ea8cf1310b1e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,18 +2,37 @@ #ifndef __MM_KASAN_KASAN_H #define __MM_KASAN_KASAN_H +#include <linux/atomic.h> #include <linux/kasan.h> #include <linux/kasan-tags.h> #include <linux/kfence.h> #include <linux/stackdepot.h> -#ifdef CONFIG_KASAN_HW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #include <linux/static_key.h> + +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +static inline bool kasan_stack_collection_enabled(void) +{ + return static_branch_unlikely(&kasan_flag_stacktrace); +} + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_stack_collection_enabled(void) +{ + return true; +} + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS + #include "../slab.h" DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, @@ -28,11 +47,6 @@ static inline bool kasan_vmalloc_enabled(void) return static_branch_likely(&kasan_flag_vmalloc); } -static inline bool kasan_stack_collection_enabled(void) -{ - return static_branch_unlikely(&kasan_flag_stacktrace); -} - static inline bool kasan_async_fault_possible(void) { return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM; @@ -43,12 +57,7 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } -#else - -static inline bool kasan_stack_collection_enabled(void) -{ - return true; -} +#else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_async_fault_possible(void) { @@ -60,7 +69,31 @@ static inline bool kasan_sync_fault_possible(void) return true; } -#endif +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/* Generic KASAN uses per-object metadata to store stack traces. */ +static inline bool kasan_requires_meta(void) +{ + /* + * Technically, Generic KASAN always collects stack traces right now. + * However, let's use kasan_stack_collection_enabled() in case the + * kasan.stacktrace command-line argument is changed to affect + * Generic KASAN. + */ + return kasan_stack_collection_enabled(); +} + +#else /* CONFIG_KASAN_GENERIC */ + +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline bool kasan_requires_meta(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -122,6 +155,13 @@ static inline bool kasan_sync_fault_possible(void) #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + enum kasan_report_type { KASAN_REPORT_ACCESS, KASAN_REPORT_INVALID_FREE, @@ -129,12 +169,22 @@ enum kasan_report_type { }; struct kasan_report_info { + /* Filled in by kasan_report_*(). */ enum kasan_report_type type; void *access_addr; - void *first_bad_addr; size_t access_size; bool is_write; unsigned long ip; + + /* Filled in by the common reporting code. */ + void *first_bad_addr; + struct kmem_cache *cache; + void *object; + + /* Filled in by the mode-specific reporting code. */ + const char *bug_type; + struct kasan_track alloc_track; + struct kasan_track free_track; }; /* Do not change the struct layout: compiler ABI. */ @@ -160,33 +210,14 @@ struct kasan_global { #endif }; -/* Structures for keeping alloc and free tracks. */ - -#define KASAN_STACK_DEPTH 64 +/* Structures for keeping alloc and free meta. */ -struct kasan_track { - u32 pid; - depot_stack_handle_t stack; -}; - -#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS) -#define KASAN_NR_FREE_STACKS 5 -#else -#define KASAN_NR_FREE_STACKS 1 -#endif +#ifdef CONFIG_KASAN_GENERIC struct kasan_alloc_meta { struct kasan_track alloc_track; - /* Generic mode stores free track in kasan_free_meta. */ -#ifdef CONFIG_KASAN_GENERIC + /* Free track is stored in kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; -#else - struct kasan_track free_track[KASAN_NR_FREE_STACKS]; -#endif -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; - u8 free_track_idx; -#endif }; struct qlist_node { @@ -205,26 +236,30 @@ struct qlist_node { * After that, slab allocator stores the freelist pointer in the object. */ struct kasan_free_meta { -#ifdef CONFIG_KASAN_GENERIC struct qlist_node quarantine_link; struct kasan_track free_track; -#endif }; -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -/* Used in KUnit-compatible KASAN tests. */ -struct kunit_kasan_status { - bool report_found; - bool sync_fault; +#endif /* CONFIG_KASAN_GENERIC */ + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +struct kasan_stack_ring_entry { + void *ptr; + size_t size; + u32 pid; + depot_stack_handle_t stack; + bool is_free; }; -#endif -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object); -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object); -#endif +struct kasan_stack_ring { + rwlock_t lock; + size_t size; + atomic64_t pos; + struct kasan_stack_ring_entry *entries; +}; + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) @@ -260,34 +295,50 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ +void *kasan_find_first_bad_addr(void *addr, size_t size); +void kasan_complete_mode_report_info(struct kasan_report_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void kasan_print_tags(u8 addr_tag, const void *addr); #else static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_report_info *info); -void kasan_metadata_fetch_row(char *buffer, void *row); - #if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } #endif +#ifdef CONFIG_KASAN_GENERIC +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } +#endif + bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); -struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); +#ifdef CONFIG_KASAN_GENERIC +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object); +#else +static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { } +static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } +#endif + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); -void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); +void kasan_save_free_info(struct kmem_cache *cache, void *object); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) @@ -358,6 +409,10 @@ static inline void kasan_enable_tagging(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void __init kasan_init_tags(void); +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_force_async_fault(void); @@ -486,6 +541,18 @@ static inline bool kasan_arch_is_ready(void) { return true; } #error kasan_arch_is_ready only works in KASAN generic outline mode! #endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_kunit_test_suite_start(void); +void kasan_kunit_test_suite_end(void); + +#else /* CONFIG_KASAN_KUNIT_TEST */ + +static inline void kasan_kunit_test_suite_start(void) { } +static inline void kasan_kunit_test_suite_end(void) { } + +#endif /* CONFIG_KASAN_KUNIT_TEST */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) bool kasan_save_enable_multi_shot(void); diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c new file mode 100644 index 000000000000..74cd80c12b25 --- /dev/null +++ b/mm/kasan/kasan_test.c @@ -0,0 +1,1570 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + */ + +#define pr_fmt(fmt) "kasan_test: " fmt + +#include <kunit/test.h> +#include <linux/bitops.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/set_memory.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/tracepoint.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <trace/events/printk.h> + +#include <asm/page.h> + +#include "kasan.h" + +#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) + +static bool multishot; + +/* Fields set based on lines observed in the console. */ +static struct { + bool report_found; + bool async_fault; +} test_status; + +/* + * Some tests use these global variables to store return values from function + * calls that could otherwise be eliminated by the compiler as dead code. + */ +void *kasan_ptr_result; +int kasan_int_result; + +/* Probe for console output: obtains test_status lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + if (strnstr(buf, "BUG: KASAN: ", len)) + WRITE_ONCE(test_status.report_found, true); + else if (strnstr(buf, "Asynchronous fault: ", len)) + WRITE_ONCE(test_status.async_fault, true); +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kasan_suite_init(struct kunit_suite *suite) +{ + if (!kasan_enabled()) { + pr_err("Can't run KASAN tests with KASAN disabled"); + return -1; + } + + /* Stop failing KUnit tests on KASAN reports. */ + kasan_kunit_test_suite_start(); + + /* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only + * report the first detected bug and panic the kernel if panic_on_warn + * is enabled. + */ + multishot = kasan_save_enable_multi_shot(); + + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kasan_suite_exit(struct kunit_suite *suite) +{ + kasan_kunit_test_suite_end(); + kasan_restore_multi_shot(multishot); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static void kasan_test_exit(struct kunit *test) +{ + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); +} + +/** + * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a + * KASAN report; causes a test failure otherwise. This relies on a KUnit + * resource named "kasan_status". Do not use this name for KUnit resources + * outside of KASAN tests. + * + * For hardware tag-based KASAN, when a synchronous tag fault happens, tag + * checking is auto-disabled. When this happens, this test handler reenables + * tag checking. As tag checking can be only disabled or enabled per CPU, + * this handler disables migration (preemption). + * + * Since the compiler doesn't see that the expression can change the test_status + * fields, it can reorder or optimize away the accesses to those fields. + * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the + * expression to prevent that. + * + * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept + * as false. This allows detecting KASAN reports that happen outside of the + * checks by asserting !test_status.report_found at the start of + * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit. + */ +#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + kasan_sync_fault_possible()) \ + migrate_disable(); \ + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \ + barrier(); \ + expression; \ + barrier(); \ + if (kasan_async_fault_possible()) \ + kasan_force_async_fault(); \ + if (!READ_ONCE(test_status.report_found)) { \ + KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ + "expected in \"" #expression \ + "\", but none occurred"); \ + } \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + kasan_sync_fault_possible()) { \ + if (READ_ONCE(test_status.report_found) && \ + !READ_ONCE(test_status.async_fault)) \ + kasan_enable_tagging(); \ + migrate_enable(); \ + } \ + WRITE_ONCE(test_status.report_found, false); \ + WRITE_ONCE(test_status.async_fault, false); \ +} while (0) + +#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ + if (!IS_ENABLED(config)) \ + kunit_skip((test), "Test requires " #config "=y"); \ +} while (0) + +#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \ + if (IS_ENABLED(config)) \ + kunit_skip((test), "Test requires " #config "=n"); \ +} while (0) + +static void kmalloc_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + /* + * An unaligned access past the requested kmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x'); + + /* + * An aligned access into the first out-of-bounds granule that falls + * within the aligned kmalloc object. + */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y'); + + /* Out-of-bounds access past the aligned kmalloc object. */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = + ptr[size + KASAN_GRANULE_SIZE + 5]); + + kfree(ptr); +} + +static void kmalloc_oob_left(struct kunit *test) +{ + char *ptr; + size_t size = 15; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1)); + kfree(ptr); +} + +static void kmalloc_node_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = 4096; + + ptr = kmalloc_node(size, GFP_KERNEL, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + kfree(ptr); +} + +/* + * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't + * fit into a slab cache and therefore is allocated via the page allocator + * fallback. Since this kind of fallback is only implemented for SLUB, these + * tests are limited to that allocator. + */ +static void kmalloc_pagealloc_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0); + + kfree(ptr); +} + +static void kmalloc_pagealloc_uaf(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); +} + +static void kmalloc_pagealloc_invalid_free(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1)); +} + +static void pagealloc_oob_right(struct kunit *test) +{ + char *ptr; + struct page *pages; + size_t order = 4; + size_t size = (1UL << (PAGE_SHIFT + order)); + + /* + * With generic KASAN page allocations have no redzones, thus + * out-of-bounds detection is not guaranteed. + * See https://bugzilla.kernel.org/show_bug.cgi?id=210503. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + free_pages((unsigned long)ptr, order); +} + +static void pagealloc_uaf(struct kunit *test) +{ + char *ptr; + struct page *pages; + size_t order = 4; + + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + free_pages((unsigned long)ptr, order); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); +} + +static void kmalloc_large_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE - 256; + + /* + * Allocate a chunk that is large enough, but still fits into a slab + * and does not trigger the page allocator fallback in SLUB. + */ + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + kfree(ptr); +} + +static void krealloc_more_oob_helper(struct kunit *test, + size_t size1, size_t size2) +{ + char *ptr1, *ptr2; + size_t middle; + + KUNIT_ASSERT_LT(test, size1, size2); + middle = size1 + (size2 - size1) / 2; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = krealloc(ptr1, size2, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + + /* All offsets up to size2 must be accessible. */ + ptr2[size1 - 1] = 'x'; + ptr2[size1] = 'x'; + ptr2[middle] = 'x'; + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + + kfree(ptr2); +} + +static void krealloc_less_oob_helper(struct kunit *test, + size_t size1, size_t size2) +{ + char *ptr1, *ptr2; + size_t middle; + + KUNIT_ASSERT_LT(test, size2, size1); + middle = size2 + (size1 - size2) / 2; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = krealloc(ptr1, size2, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + + /* Must be accessible for all modes. */ + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + + /* + * For all modes all size2, middle, and size1 should land in separate + * granules and thus the latter two offsets should be inaccessible. + */ + KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE), + round_down(middle, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE), + round_down(size1, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x'); + + kfree(ptr2); +} + +static void krealloc_more_oob(struct kunit *test) +{ + krealloc_more_oob_helper(test, 201, 235); +} + +static void krealloc_less_oob(struct kunit *test) +{ + krealloc_less_oob_helper(test, 235, 201); +} + +static void krealloc_pagealloc_more_oob(struct kunit *test) +{ + /* page_alloc fallback in only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201, + KMALLOC_MAX_CACHE_SIZE + 235); +} + +static void krealloc_pagealloc_less_oob(struct kunit *test) +{ + /* page_alloc fallback in only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235, + KMALLOC_MAX_CACHE_SIZE + 201); +} + +/* + * Check that krealloc() detects a use-after-free, returns NULL, + * and doesn't unpoison the freed object. + */ +static void krealloc_uaf(struct kunit *test) +{ + char *ptr1, *ptr2; + int size1 = 201; + int size2 = 235; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL)); + KUNIT_ASSERT_NULL(test, ptr2); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1); +} + +static void kmalloc_oob_16(struct kunit *test) +{ + struct { + u64 words[2]; + } *ptr1, *ptr2; + + /* This test is specifically crafted for the generic mode. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + OPTIMIZER_HIDE_VAR(ptr1); + OPTIMIZER_HIDE_VAR(ptr2); + KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + kfree(ptr1); + kfree(ptr2); +} + +static void kmalloc_uaf_16(struct kunit *test) +{ + struct { + u64 words[2]; + } *ptr1, *ptr2; + + ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + kfree(ptr1); +} + +/* + * Note: in the memset tests below, the written range touches both valid and + * invalid memory. This makes sure that the instrumentation does not only check + * the starting address but the whole range. + */ + +static void kmalloc_oob_memset_2(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); + kfree(ptr); +} + +static void kmalloc_oob_memset_4(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); + kfree(ptr); +} + +static void kmalloc_oob_memset_8(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); + kfree(ptr); +} + +static void kmalloc_oob_memset_16(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); + kfree(ptr); +} + +static void kmalloc_oob_in_memset(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, + memset(ptr, 0, size + KASAN_GRANULE_SIZE)); + kfree(ptr); +} + +static void kmalloc_memmove_negative_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + size_t invalid_size = -2; + + /* + * Hardware tag-based mode doesn't check memmove for negative size. + * As a result, this test introduces a side-effect memory corruption, + * which can result in a crash. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + +static void kmalloc_memmove_invalid_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + size_t invalid_size = size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + +static void kmalloc_uaf(struct kunit *test) +{ + char *ptr; + size_t size = 10; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]); +} + +static void kmalloc_uaf_memset(struct kunit *test) +{ + char *ptr; + size_t size = 33; + + /* + * Only generic KASAN uses quarantine, which is required to avoid a + * kernel memory corruption this test causes. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size)); +} + +static void kmalloc_uaf2(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 43; + int counter = 0; + +again: + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + /* + * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same. + * Allow up to 16 attempts at generating different tags. + */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) { + kfree(ptr2); + goto again; + } + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]); + KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); + + kfree(ptr2); +} + +/* + * Check that KASAN detects use-after-free when another object was allocated in + * the same slot. Relevant for the tag-based modes, which do not use quarantine. + */ +static void kmalloc_uaf3(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 100; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); +} + +static void kfree_via_page(struct kunit *test) +{ + char *ptr; + size_t size = 8; + struct page *page; + unsigned long offset; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + page = virt_to_page(ptr); + offset = offset_in_page(ptr); + kfree(page_address(page) + offset); +} + +static void kfree_via_phys(struct kunit *test) +{ + char *ptr; + size_t size = 8; + phys_addr_t phys; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + phys = virt_to_phys(ptr); + kfree(phys_to_virt(phys)); +} + +static void kmem_cache_oob(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]); + + kmem_cache_free(cache, p); + kmem_cache_destroy(cache); +} + +static void kmem_cache_accounted(struct kunit *test) +{ + int i; + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + /* + * Several allocations with a delay to allow for lazy per memcg kmem + * cache creation. + */ + for (i = 0; i < 5; i++) { + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) + goto free_cache; + + kmem_cache_free(cache, p); + msleep(100); + } + +free_cache: + kmem_cache_destroy(cache); +} + +static void kmem_cache_bulk(struct kunit *test) +{ + struct kmem_cache *cache; + size_t size = 200; + char *p[10]; + bool ret; + int i; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p); + if (!ret) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + for (i = 0; i < ARRAY_SIZE(p); i++) + p[i][0] = p[i][size - 1] = 42; + + kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p); + kmem_cache_destroy(cache); +} + +static char global_array[10]; + +static void kasan_global_oob_right(struct kunit *test) +{ + /* + * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS + * from failing here and panicking the kernel, access the array via a + * volatile pointer, which will prevent the compiler from being able to + * determine the array bounds. + * + * This access uses a volatile pointer to char (char *volatile) rather + * than the more conventional pointer to volatile char (volatile char *) + * because we want to prevent the compiler from making inferences about + * the pointer itself (i.e. its array bounds), not the data that it + * refers to. + */ + char *volatile array = global_array; + char *p = &array[ARRAY_SIZE(global_array) + 3]; + + /* Only generic mode instruments globals. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kasan_global_oob_left(struct kunit *test) +{ + char *volatile array = global_array; + char *p = array - 3; + + /* + * GCC is known to fail this test, skip it. + * See https://bugzilla.kernel.org/show_bug.cgi?id=215051. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +/* Check that ksize() does NOT unpoison whole object. */ +static void ksize_unpoisons_memory(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; + size_t real_size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + real_size = ksize(ptr); + KUNIT_EXPECT_GT(test, real_size, size); + + OPTIMIZER_HIDE_VAR(ptr); + + /* These accesses shouldn't trigger a KASAN report. */ + ptr[0] = 'x'; + ptr[size - 1] = 'x'; + + /* These must trigger a KASAN report. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]); + + kfree(ptr); +} + +/* + * Check that a use-after-free is detected by ksize() and via normal accesses + * after it. + */ +static void ksize_uaf(struct kunit *test) +{ + char *ptr; + int size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); +} + +static void kasan_stack_oob(struct kunit *test) +{ + char stack_array[10]; + /* See comment in kasan_global_oob_right. */ + char *volatile array = stack_array; + char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF]; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kasan_alloca_oob_left(struct kunit *test) +{ + volatile int i = 10; + char alloca_array[i]; + /* See comment in kasan_global_oob_right. */ + char *volatile array = alloca_array; + char *p = array - 1; + + /* Only generic mode instruments dynamic allocas. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kasan_alloca_oob_right(struct kunit *test) +{ + volatile int i = 10; + char alloca_array[i]; + /* See comment in kasan_global_oob_right. */ + char *volatile array = alloca_array; + char *p = array + i; + + /* Only generic mode instruments dynamic allocas. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kmem_cache_double_free(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + kmem_cache_free(cache, p); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p)); + kmem_cache_destroy(cache); +} + +static void kmem_cache_invalid_free(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + /* Trigger invalid free, the object doesn't get freed. */ + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1)); + + /* + * Properly free the object to prevent the "Objects remaining in + * test_cache on __kmem_cache_shutdown" BUG failure. + */ + kmem_cache_free(cache, p); + + kmem_cache_destroy(cache); +} + +static void empty_cache_ctor(void *object) { } + +static void kmem_cache_double_destroy(struct kunit *test) +{ + struct kmem_cache *cache; + + /* Provide a constructor to prevent cache merging. */ + cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + kmem_cache_destroy(cache); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); +} + +static void kasan_memchr(struct kunit *test) +{ + char *ptr; + size_t size = 24; + + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); + + if (OOB_TAG_OFF) + size = round_up(size, OOB_TAG_OFF); + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_ptr_result = memchr(ptr, '1', size + 1)); + + kfree(ptr); +} + +static void kasan_memcmp(struct kunit *test) +{ + char *ptr; + size_t size = 24; + int arr[9]; + + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); + + if (OOB_TAG_OFF) + size = round_up(size, OOB_TAG_OFF); + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + memset(arr, 0, sizeof(arr)); + + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_int_result = memcmp(ptr, arr, size+1)); + kfree(ptr); +} + +static void kasan_strings(struct kunit *test) +{ + char *ptr; + size_t size = 24; + + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree(ptr); + + /* + * Try to cause only 1 invalid access (less spam in dmesg). + * For that we need ptr to point to zeroed byte. + * Skip metadata that could be stored in freed object so ptr + * will likely point to zeroed byte. + */ + ptr += 16; + KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1')); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1')); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2")); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1)); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr)); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1)); +} + +static void kasan_bitops_modify(struct kunit *test, int nr, void *addr) +{ + KUNIT_EXPECT_KASAN_FAIL(test, set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(nr, addr)); +} + +static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) +{ + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); + +#if defined(clear_bit_unlock_is_negative_byte) + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = + clear_bit_unlock_is_negative_byte(nr, addr)); +#endif +} + +static void kasan_bitops_generic(struct kunit *test) +{ + long *bits; + + /* This test is specifically crafted for the generic mode. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + /* + * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes; + * this way we do not actually corrupt other memory. + */ + bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); + + /* + * Below calls try to access bit within allocated memory; however, the + * below accesses are still out-of-bounds, since bitops are defined to + * operate on the whole long the bit is in. + */ + kasan_bitops_modify(test, BITS_PER_LONG, bits); + + /* + * Below calls try to access bit beyond allocated memory. + */ + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, bits); + + kfree(bits); +} + +static void kasan_bitops_tags(struct kunit *test) +{ + long *bits; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */ + bits = kzalloc(48, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); + + /* Do the accesses past the 48 allocated bytes, but within the redone. */ + kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48); + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48); + + kfree(bits); +} + +static void kmalloc_double_kzfree(struct kunit *test) +{ + char *ptr; + size_t size = 16; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree_sensitive(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); +} + +/* + * The two tests below check that Generic KASAN prints auxiliary stack traces + * for RCU callbacks and workqueues. The reports need to be inspected manually. + * + * These tests are still enabled for other KASAN modes to make sure that all + * modes report bad accesses in tested scenarios. + */ + +static struct kasan_rcu_info { + int i; + struct rcu_head rcu; +} *global_rcu_ptr; + +static void rcu_uaf_reclaim(struct rcu_head *rp) +{ + struct kasan_rcu_info *fp = + container_of(rp, struct kasan_rcu_info, rcu); + + kfree(fp); + ((volatile struct kasan_rcu_info *)fp)->i; +} + +static void rcu_uaf(struct kunit *test) +{ + struct kasan_rcu_info *ptr; + + ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + global_rcu_ptr = rcu_dereference_protected( + (struct kasan_rcu_info __rcu *)ptr, NULL); + + KUNIT_EXPECT_KASAN_FAIL(test, + call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); + rcu_barrier()); +} + +static void workqueue_uaf_work(struct work_struct *work) +{ + kfree(work); +} + +static void workqueue_uaf(struct kunit *test) +{ + struct workqueue_struct *workqueue; + struct work_struct *work; + + workqueue = create_workqueue("kasan_workqueue_test"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue); + + work = kmalloc(sizeof(struct work_struct), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work); + + INIT_WORK(work, workqueue_uaf_work); + queue_work(workqueue, work); + destroy_workqueue(workqueue); + + KUNIT_EXPECT_KASAN_FAIL(test, + ((volatile struct work_struct *)work)->data); +} + +static void vmalloc_helpers_tags(struct kunit *test) +{ + void *ptr; + + /* This test is intended for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. */ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + +#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST) + { + int rv; + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + } +#endif + + vfree(ptr); +} + +static void vmalloc_oob(struct kunit *test) +{ + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + OPTIMIZER_HIDE_VAR(v_ptr); + + /* + * We have to be careful not to hit the guard page in vmalloc tests. + * The MMU will catch that and crash us. + */ + + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. */ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. + */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, 1); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1); + free_pages((unsigned long)p_ptr, 1); +} + +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. */ + *c_ptr = 0; + } + + free_percpu(ptr); +} + +/* + * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, + * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based + * modes. + */ +static void match_all_not_assigned(struct kunit *test) +{ + char *ptr; + struct page *pages; + int i, size, order; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + for (i = 0; i < 256; i++) { + size = get_random_u32_inclusive(1, 1024); + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + kfree(ptr); + } + + for (i = 0; i < 256; i++) { + order = get_random_u32_inclusive(1, 4); + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + free_pages((unsigned long)ptr, order); + } + + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = get_random_u32_inclusive(1, 1024); + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } +} + +/* Check that 0xff works as a match-all pointer tag for tag-based modes. */ +static void match_all_ptr_tag(struct kunit *test) +{ + char *ptr; + u8 tag; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(128, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Backup the assigned tag. */ + tag = get_tag(ptr); + KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL); + + /* Reset the tag to 0xff.*/ + ptr = set_tag(ptr, KASAN_TAG_KERNEL); + + /* This access shouldn't trigger a KASAN report. */ + *ptr = 0; + + /* Recover the pointer tag and free. */ + ptr = set_tag(ptr, tag); + kfree(ptr); +} + +/* Check that there are no match-all memory tags for tag-based modes. */ +static void match_all_mem_tag(struct kunit *test) +{ + char *ptr; + int tag; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(128, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* For each possible tag value not matching the pointer tag. */ + for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) { + if (tag == get_tag(ptr)) + continue; + + /* Mark the first memory granule with the chosen memory tag. */ + kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag, false); + + /* This access must cause a KASAN report. */ + KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0); + } + + /* Recover the memory tag and free. */ + kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr), false); + kfree(ptr); +} + +static struct kunit_case kasan_kunit_test_cases[] = { + KUNIT_CASE(kmalloc_oob_right), + KUNIT_CASE(kmalloc_oob_left), + KUNIT_CASE(kmalloc_node_oob_right), + KUNIT_CASE(kmalloc_pagealloc_oob_right), + KUNIT_CASE(kmalloc_pagealloc_uaf), + KUNIT_CASE(kmalloc_pagealloc_invalid_free), + KUNIT_CASE(pagealloc_oob_right), + KUNIT_CASE(pagealloc_uaf), + KUNIT_CASE(kmalloc_large_oob_right), + KUNIT_CASE(krealloc_more_oob), + KUNIT_CASE(krealloc_less_oob), + KUNIT_CASE(krealloc_pagealloc_more_oob), + KUNIT_CASE(krealloc_pagealloc_less_oob), + KUNIT_CASE(krealloc_uaf), + KUNIT_CASE(kmalloc_oob_16), + KUNIT_CASE(kmalloc_uaf_16), + KUNIT_CASE(kmalloc_oob_in_memset), + KUNIT_CASE(kmalloc_oob_memset_2), + KUNIT_CASE(kmalloc_oob_memset_4), + KUNIT_CASE(kmalloc_oob_memset_8), + KUNIT_CASE(kmalloc_oob_memset_16), + KUNIT_CASE(kmalloc_memmove_negative_size), + KUNIT_CASE(kmalloc_memmove_invalid_size), + KUNIT_CASE(kmalloc_uaf), + KUNIT_CASE(kmalloc_uaf_memset), + KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kmalloc_uaf3), + KUNIT_CASE(kfree_via_page), + KUNIT_CASE(kfree_via_phys), + KUNIT_CASE(kmem_cache_oob), + KUNIT_CASE(kmem_cache_accounted), + KUNIT_CASE(kmem_cache_bulk), + KUNIT_CASE(kasan_global_oob_right), + KUNIT_CASE(kasan_global_oob_left), + KUNIT_CASE(kasan_stack_oob), + KUNIT_CASE(kasan_alloca_oob_left), + KUNIT_CASE(kasan_alloca_oob_right), + KUNIT_CASE(ksize_unpoisons_memory), + KUNIT_CASE(ksize_uaf), + KUNIT_CASE(kmem_cache_double_free), + KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kmem_cache_double_destroy), + KUNIT_CASE(kasan_memchr), + KUNIT_CASE(kasan_memcmp), + KUNIT_CASE(kasan_strings), + KUNIT_CASE(kasan_bitops_generic), + KUNIT_CASE(kasan_bitops_tags), + KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(rcu_uaf), + KUNIT_CASE(workqueue_uaf), + KUNIT_CASE(vmalloc_helpers_tags), + KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), + KUNIT_CASE(match_all_not_assigned), + KUNIT_CASE(match_all_ptr_tag), + KUNIT_CASE(match_all_mem_tag), + {} +}; + +static struct kunit_suite kasan_kunit_test_suite = { + .name = "kasan", + .test_cases = kasan_kunit_test_cases, + .exit = kasan_test_exit, + .suite_init = kasan_suite_init, + .suite_exit = kasan_suite_exit, +}; + +kunit_test_suite(kasan_kunit_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c new file mode 100644 index 000000000000..7be7bed456ef --- /dev/null +++ b/mm/kasan/kasan_test_module.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + */ + +#define pr_fmt(fmt) "kasan test: %s " fmt, __func__ + +#include <linux/mman.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "kasan.h" + +static noinline void __init copy_user_test(void) +{ + char *kmem; + char __user *usermem; + size_t size = 128 - KASAN_GRANULE_SIZE; + int __maybe_unused unused; + + kmem = kmalloc(size, GFP_KERNEL); + if (!kmem) + return; + + usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + if (IS_ERR(usermem)) { + pr_err("Failed to allocate user memory\n"); + kfree(kmem); + return; + } + + OPTIMIZER_HIDE_VAR(size); + + pr_info("out-of-bounds in copy_from_user()\n"); + unused = copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in copy_to_user()\n"); + unused = copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user()\n"); + unused = __copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user()\n"); + unused = __copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1); + + pr_info("out-of-bounds in strncpy_from_user()\n"); + unused = strncpy_from_user(kmem, usermem, size + 1); + + vm_munmap((unsigned long)usermem, PAGE_SIZE); + kfree(kmem); +} + +static int __init test_kasan_module_init(void) +{ + /* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only + * report the first detected bug and panic the kernel if panic_on_warn + * is enabled. + */ + bool multishot = kasan_save_enable_multi_shot(); + + copy_user_test(); + + kasan_restore_multi_shot(multishot); + return -EAGAIN; +} + +module_init(test_kasan_module_init); +MODULE_LICENSE("GPL"); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index fe3f606b3a98..1d02757e90a3 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -9,6 +9,7 @@ * Andrey Konovalov <andreyknvl@gmail.com> */ +#include <kunit/test.h> #include <linux/bitops.h> #include <linux/ftrace.h> #include <linux/init.h> @@ -30,8 +31,6 @@ #include <asm/sections.h> -#include <kunit/test.h> - #include "kasan.h" #include "../slab.h" @@ -115,40 +114,63 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); #endif #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -static void update_kunit_status(bool sync) + +/* + * Whether the KASAN KUnit test suite is currently being executed. + * Updated in kasan_test.c. + */ +bool kasan_kunit_executing; + +void kasan_kunit_test_suite_start(void) +{ + WRITE_ONCE(kasan_kunit_executing, true); +} +EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start); + +void kasan_kunit_test_suite_end(void) +{ + WRITE_ONCE(kasan_kunit_executing, false); +} +EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end); + +static bool kasan_kunit_test_suite_executing(void) +{ + return READ_ONCE(kasan_kunit_executing); +} + +#else /* CONFIG_KASAN_KUNIT_TEST */ + +static inline bool kasan_kunit_test_suite_executing(void) { return false; } + +#endif /* CONFIG_KASAN_KUNIT_TEST */ + +#if IS_ENABLED(CONFIG_KUNIT) + +static void fail_non_kasan_kunit_test(void) { struct kunit *test; - struct kunit_resource *resource; - struct kunit_kasan_status *status; - test = current->kunit_test; - if (!test) + if (kasan_kunit_test_suite_executing()) return; - resource = kunit_find_named_resource(test, "kasan_status"); - if (!resource) { + test = current->kunit_test; + if (test) kunit_set_failure(test); - return; - } +} - status = (struct kunit_kasan_status *)resource->data; - WRITE_ONCE(status->report_found, true); - WRITE_ONCE(status->sync_fault, sync); +#else /* CONFIG_KUNIT */ - kunit_put_resource(resource); -} -#else -static void update_kunit_status(bool sync) { } -#endif +static inline void fail_non_kasan_kunit_test(void) { } + +#endif /* CONFIG_KUNIT */ static DEFINE_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { + fail_non_kasan_kunit_test(); /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ disable_trace_on_warning(); - /* Update status of the currently running KASAN test. */ - update_kunit_status(sync); /* Do not allow LOCKDEP mangling KASAN reports. */ lockdep_off(); /* Make sure we don't end up in loop. */ @@ -164,8 +186,8 @@ static void end_report(unsigned long *flags, void *addr) (unsigned long)addr); pr_err("==================================================================\n"); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - panic("panic_on_warn set ...\n"); + if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + check_panic_on_warn("KASAN"); if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -175,18 +197,14 @@ static void end_report(unsigned long *flags, void *addr) static void print_error_description(struct kasan_report_info *info) { - if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip); - return; - } + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); - if (info->type == KASAN_REPORT_DOUBLE_FREE) { - pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip); + if (info->type != KASAN_REPORT_ACCESS) { + pr_err("Free of addr %px by task %s/%d\n", + info->access_addr, current->comm, task_pid_nr(current)); return; } - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -200,31 +218,21 @@ static void print_error_description(struct kasan_report_info *info) static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); - if (track->stack) { + if (track->stack) stack_depot_print(track->stack); - } else { + else pr_err("(stack is not available)\n"); - } } -struct page *kasan_addr_to_page(const void *addr) +static inline struct page *addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_head_page(addr); return NULL; } -struct slab *kasan_addr_to_slab(const void *addr) -{ - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) - return virt_to_slab(addr); - return NULL; -} - -static void describe_object_addr(struct kmem_cache *cache, void *object, - const void *addr) +static void describe_object_addr(const void *addr, struct kmem_cache *cache, + void *object) { unsigned long access_addr = (unsigned long)addr; unsigned long object_addr = (unsigned long)object; @@ -252,46 +260,26 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object_stacks(struct kasan_report_info *info) { - struct kasan_alloc_meta *alloc_meta; - struct kasan_track *free_track; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) { - print_track(&alloc_meta->alloc_track, "Allocated"); + if (info->alloc_track.stack) { + print_track(&info->alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(cache, object, tag); - if (free_track) { - print_track(free_track, "Freed"); + if (info->free_track.stack) { + print_track(&info->free_track, "Freed"); pr_err("\n"); } -#ifdef CONFIG_KASAN_GENERIC - if (!alloc_meta) - return; - if (alloc_meta->aux_stack[0]) { - pr_err("Last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[0]); - pr_err("\n"); - } - if (alloc_meta->aux_stack[1]) { - pr_err("Second to last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[1]); - pr_err("\n"); - } -#endif + kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(cache, object, addr, tag); - describe_object_addr(cache, object, addr); + describe_object_stacks(info); + describe_object_addr(addr, info->cache, info->object); } static inline bool kernel_or_module_addr(const void *addr) @@ -310,19 +298,16 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static void print_address_description(void *addr, u8 tag) +static void print_address_description(void *addr, u8 tag, + struct kasan_report_info *info) { - struct page *page = kasan_addr_to_page(addr); + struct page *page = addr_to_page(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (page && PageSlab(page)) { - struct slab *slab = page_slab(page); - struct kmem_cache *cache = slab->slab_cache; - void *object = nearest_obj(cache, slab, addr); - - describe_object(cache, object, addr, tag); + if (info->cache && info->object) { + describe_object(addr, info); pr_err("\n"); } @@ -420,23 +405,56 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *tagged_addr = info->access_addr; - void *untagged_addr = kasan_reset_tag(tagged_addr); - u8 tag = get_tag(tagged_addr); + void *addr = kasan_reset_tag(info->access_addr); + u8 tag = get_tag(info->access_addr); print_error_description(info); - if (addr_has_metadata(untagged_addr)) + if (addr_has_metadata(addr)) kasan_print_tags(tag, info->first_bad_addr); pr_err("\n"); - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, tag); + if (addr_has_metadata(addr)) { + print_address_description(addr, tag, info); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); } } +static void complete_report_info(struct kasan_report_info *info) +{ + void *addr = kasan_reset_tag(info->access_addr); + struct slab *slab; + + if (info->type == KASAN_REPORT_ACCESS) + info->first_bad_addr = kasan_find_first_bad_addr( + info->access_addr, info->access_size); + else + info->first_bad_addr = addr; + + slab = kasan_addr_to_slab(addr); + if (slab) { + info->cache = slab->slab_cache; + info->object = nearest_obj(info->cache, slab, addr); + } else + info->cache = info->object = NULL; + + switch (info->type) { + case KASAN_REPORT_INVALID_FREE: + info->bug_type = "invalid-free"; + break; + case KASAN_REPORT_DOUBLE_FREE: + info->bug_type = "double-free"; + break; + default: + /* bug_type filled in by kasan_complete_mode_report_info. */ + break; + } + + /* Fill in mode-specific report info fields. */ + kasan_complete_mode_report_info(info); +} + void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; @@ -452,13 +470,15 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty start_report(&flags, true); + memset(&info, 0, sizeof(info)); info.type = type; info.access_addr = ptr; - info.first_bad_addr = kasan_reset_tag(ptr); info.access_size = 0; info.is_write = false; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&flags, ptr); @@ -485,13 +505,15 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, start_report(&irq_flags, true); + memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; - info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); info.access_size = size; info.is_write = is_write; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&irq_flags, ptr); diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 6689fb9a919b..043c94b04605 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_report_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -127,11 +127,55 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } +void kasan_complete_mode_report_info(struct kasan_report_info *info) +{ + struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; + + if (!info->bug_type) + info->bug_type = get_bug_type(info); + + if (!info->cache || !info->object) + return; + + alloc_meta = kasan_get_alloc_meta(info->cache, info->object); + if (alloc_meta) + memcpy(&info->alloc_track, &alloc_meta->alloc_track, + sizeof(info->alloc_track)); + + if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) { + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + free_meta = kasan_get_free_meta(info->cache, info->object); + memcpy(&info->free_track, &free_meta->free_track, + sizeof(info->free_track)); + } +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[0]); + pr_err("\n"); + } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +} + #ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index e25d2166e813..ecede06ef374 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -4,38 +4,14 @@ * Copyright (c) 2020 Google, Inc. */ +#include <linux/atomic.h> + #include "kasan.h" -#include "../slab.h" -const char *kasan_get_bug_type(struct kasan_report_info *info) -{ -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - struct kasan_alloc_meta *alloc_meta; - struct kmem_cache *cache; - struct slab *slab; - const void *addr; - void *object; - u8 tag; - int i; - - tag = get_tag(info->access_addr); - addr = kasan_reset_tag(info->access_addr); - slab = kasan_addr_to_slab(addr); - if (slab) { - cache = slab->slab_cache; - object = nearest_obj(cache, slab, (void *)addr); - alloc_meta = kasan_get_alloc_meta(cache, object); - - if (alloc_meta) { - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; - } - } - return "out-of-bounds"; - } -#endif +extern struct kasan_stack_ring stack_ring; +static const char *get_common_bug_type(struct kasan_report_info *info) +{ /* * If access_size is a negative number, then it has reason to be * defined as out-of-bounds bug type. @@ -49,3 +25,92 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } + +void kasan_complete_mode_report_info(struct kasan_report_info *info) +{ + unsigned long flags; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *ptr; + u32 pid; + depot_stack_handle_t stack; + bool is_free; + bool alloc_found = false, free_found = false; + + if ((!info->cache || !info->object) && !info->bug_type) { + info->bug_type = get_common_bug_type(info); + return; + } + + write_lock_irqsave(&stack_ring.lock, flags); + + pos = atomic64_read(&stack_ring.pos); + + /* + * The loop below tries to find stack ring entries relevant to the + * buggy object. This is a best-effort process. + * + * First, another object with the same tag can be allocated in place of + * the buggy object. Also, since the number of entries is limited, the + * entries relevant to the buggy object can be overwritten. + */ + + for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) { + if (alloc_found && free_found) + break; + + entry = &stack_ring.entries[i % stack_ring.size]; + + /* Paired with smp_store_release() in save_stack_info(). */ + ptr = (void *)smp_load_acquire(&entry->ptr); + + if (kasan_reset_tag(ptr) != info->object || + get_tag(ptr) != get_tag(info->access_addr)) + continue; + + pid = READ_ONCE(entry->pid); + stack = READ_ONCE(entry->stack); + is_free = READ_ONCE(entry->is_free); + + if (is_free) { + /* + * Second free of the same object. + * Give up on trying to find the alloc entry. + */ + if (free_found) + break; + + info->free_track.pid = pid; + info->free_track.stack = stack; + free_found = true; + + /* + * If a free entry is found first, the bug is likely + * a use-after-free. + */ + if (!info->bug_type) + info->bug_type = "use-after-free"; + } else { + /* Second alloc of the same object. Give up. */ + if (alloc_found) + break; + + info->alloc_track.pid = pid; + info->alloc_track.stack = stack; + alloc_found = true; + + /* + * If an alloc entry is found first, the bug is likely + * an out-of-bounds. + */ + if (!info->bug_type) + info->bug_type = "slab-out-of-bounds"; + } + } + + write_unlock_irqrestore(&stack_ring.lock, flags); + + /* Assign the common bug type if no entries were found. */ + if (!info->bug_type) + info->bug_type = get_common_bug_type(info); +} diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 0e3648b603a6..2fba1f51f042 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - hotplug_memory_notifier(kasan_mem_notifier, 0); + hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 77f13f391b57..a3afaf2ad1b1 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void) for_each_possible_cpu(cpu) per_cpu(prng_state, cpu) = (u32)get_cycles(); - pr_info("KernelAddressSanitizer initialized (sw-tags)\n"); + kasan_init_tags(); + + pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", + kasan_stack_collection_enabled() ? "on" : "off"); } /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 8f48b9502a17..67a222586846 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -6,9 +6,11 @@ * Copyright (c) 2020 Google, Inc. */ +#include <linux/atomic.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> +#include <linux/memblock.h> #include <linux/memory.h> #include <linux/mm.h> #include <linux/static_key.h> @@ -16,44 +18,127 @@ #include <linux/types.h> #include "kasan.h" +#include "../slab.h" -void kasan_set_free_info(struct kmem_cache *cache, - void *object, u8 tag) -{ - struct kasan_alloc_meta *alloc_meta; - u8 idx = 0; +#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10) + +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return; +/* Non-zero, as initial pointer values are 0. */ +#define STACK_RING_BUSY_PTR ((void *)1) + +struct kasan_stack_ring stack_ring = { + .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) +}; + +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - idx = alloc_meta->free_track_idx; - alloc_meta->free_pointer_tag[idx] = tag; - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; -#endif + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; - kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); + return 0; } +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) +/* kasan.stack_ring_size=<number of entries> */ +static int __init early_kasan_flag_stack_ring_size(char *arg) { - struct kasan_alloc_meta *alloc_meta; - int i = 0; + if (!arg) + return -EINVAL; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; + return kstrtoul(arg, 0, &stack_ring.size); +} +early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); + +void __init kasan_init_tags(void) +{ + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default is specified by kasan_flag_stacktrace definition. */ + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; + } -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - break; + if (kasan_stack_collection_enabled()) { + if (!stack_ring.size) + stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT; + stack_ring.entries = memblock_alloc( + sizeof(stack_ring.entries[0]) * stack_ring.size, + SMP_CACHE_BYTES); + if (WARN_ON(!stack_ring.entries)) + static_branch_disable(&kasan_flag_stacktrace); } - if (i == KASAN_NR_FREE_STACKS) - i = alloc_meta->free_track_idx; -#endif +} + +static void save_stack_info(struct kmem_cache *cache, void *object, + gfp_t gfp_flags, bool is_free) +{ + unsigned long flags; + depot_stack_handle_t stack; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *old_ptr; + + stack = kasan_save_stack(gfp_flags, true); + + /* + * Prevent save_stack_info() from modifying stack ring + * when kasan_complete_mode_report_info() is walking it. + */ + read_lock_irqsave(&stack_ring.lock, flags); + +next: + pos = atomic64_fetch_add(1, &stack_ring.pos); + entry = &stack_ring.entries[pos % stack_ring.size]; + + /* Detect stack ring entry slots that are being written to. */ + old_ptr = READ_ONCE(entry->ptr); + if (old_ptr == STACK_RING_BUSY_PTR) + goto next; /* Busy slot. */ + if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) + goto next; /* Busy slot. */ + + WRITE_ONCE(entry->size, cache->object_size); + WRITE_ONCE(entry->pid, current->pid); + WRITE_ONCE(entry->stack, stack); + WRITE_ONCE(entry->is_free, is_free); + + /* + * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). + */ + smp_store_release(&entry->ptr, (s64)object); - return &alloc_meta->free_track[i]; + read_unlock_irqrestore(&stack_ring.lock, flags); +} + +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + save_stack_info(cache, object, flags, false); +} + +void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + save_stack_info(cache, object, GFP_NOWAIT, true); } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index c252081b11df..5349c37a5dac 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -26,7 +26,6 @@ #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> -#include <linux/sched/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -360,9 +359,9 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g unsigned long flags; struct slab *slab; void *addr; - const bool random_right_allocate = prandom_u32_max(2); + const bool random_right_allocate = get_random_u32_below(2); const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && - !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS); + !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS); /* Try to obtain a free object. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); @@ -719,24 +718,13 @@ static int show_object(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations object_seqops = { +static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; - -static int open_objects(struct inode *inode, struct file *file) -{ - return seq_open(file, &object_seqops); -} - -static const struct file_operations objects_fops = { - .open = open_objects, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(objects); static int __init kfence_debugfs_init(void) { @@ -810,16 +798,7 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - if (sysctl_hung_task_timeout_secs) { - /* - * During low activity with no allocations we might wait a - * while; let's avoid the hung task warning. - */ - wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), - sysctl_hung_task_timeout_secs * HZ / 2); - } else { - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); - } + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); @@ -864,7 +843,7 @@ static void kfence_init_enable(void) void __init kfence_init(void) { - stack_hash_seed = (u32)random_get_entropy(); + stack_hash_seed = get_random_u32(); /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ if (!kfence_sample_interval) @@ -1003,6 +982,13 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) return NULL; } + /* + * Skip allocations for this slab, if KFENCE has been disabled for + * this slab. + */ + if (s->flags & SLAB_SKIP_KFENCE) + return NULL; + if (atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index a97bffe0cc3e..b5d66a69200d 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -532,8 +532,8 @@ static void test_free_bulk(struct kunit *test) int iter; for (iter = 0; iter < 5; iter++) { - const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0, - (iter & 1) ? ctor_set_x : NULL); + const size_t size = setup_test_cache(test, get_random_u32_inclusive(8, 307), + 0, (iter & 1) ? ctor_set_x : NULL); void *objects[] = { test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT), test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f5a6d8ba3e21..60205f1257ef 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -75,15 +75,21 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { /* - * In case of tail calls from any of the below - * to any of the above. + * In case of tail calls from any of the below to any of + * the above, optimized by the compiler such that the + * stack trace would omit the initial entry point below. */ fallback = skipnr + 1; } - /* Also the *_bulk() variants by only checking prefixes. */ + /* + * The below list should only include the initial entry points + * into the slab allocators. Includes the *_bulk() variants by + * checking prefixes. + */ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || @@ -267,8 +273,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r lockdep_on(); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KFENCE"); /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01f71786d530..5cb401aa2b9d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -23,16 +23,20 @@ #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" +#include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_NONE, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -73,6 +77,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -85,18 +91,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + bool is_khugepaged; + + /* Num pages scanned per node */ + u32 node_load[MAX_NUMNODES]; + + /* nodemask for allocation fallback */ + nodemask_t alloc_nmask; +}; + /** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for + * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot * @nr_pte_mapped_thp: number of pte mapped THP * @pte_mapped_thp: address array corresponding pte mapped THP */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; +struct khugepaged_mm_slot { + struct mm_slot slot; /* pte-mapped THP in this mm */ int nr_pte_mapped_thp; @@ -113,7 +125,7 @@ struct mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; unsigned long address; }; @@ -377,8 +389,9 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); + sizeof(struct khugepaged_mm_slot), + __alignof__(struct khugepaged_mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -395,65 +408,38 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - -static inline int khugepaged_test_exit(struct mm_struct *mm) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } void __khugepaged_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; + slot = &mm_slot->slot; + /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); return; } spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); @@ -466,37 +452,38 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false)) + if (hugepage_vma_check(vma, vm_flags, false, false, true)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { /* * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_lock. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); @@ -546,11 +533,12 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; @@ -558,8 +546,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -579,11 +569,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } } if (PageCompound(page)) { @@ -646,10 +639,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -658,19 +655,19 @@ next: if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -730,14 +727,16 @@ static void khugepaged_alloc_sleep(void) DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); - freezable_schedule_timeout_interruptible( - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, +}; -static bool khugepaged_scan_abort(int nid) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -749,11 +748,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; @@ -772,146 +771,59 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } - /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { - target_node = nid; - break; - } + for_each_online_node(nid) { + if (max_value == cc->node_load[nid]) + node_set(nid, cc->alloc_nmask); + } - last_khugepaged_target_node = target_node; return target_node; } - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +#else +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; + return 0; } +#endif -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, + nodemask_t *nmask) { - VM_BUG_ON_PAGE(*hpage, *hpage); - - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); + *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; + return false; } prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - /* - * If the hpage allocated earlier was briefly exposed in page cache - * before collapse_file() failed, it is possible that racing lookups - * have not yet completed, and would then be unpleasantly surprised by - * finding the hpage reused for the same mapping at a different offset. - * Just release the previous allocation if there is any danger of that. - */ - if (*hpage && page_count(*hpage) > 1) { - put_page(*hpage); - *hpage = NULL; - } - - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - return true; } -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). + * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) + bool expect_anon, + struct vm_area_struct **vmap, + struct collapse_control *cc) { struct vm_area_struct *vma; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -920,7 +832,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -929,23 +842,62 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * hugepage_vma_check may return true for qualified file * vmas. */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return SCAN_VMA_CHECK; - return 0; + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; + return SCAN_SUCCEED; +} + +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmdp_get_lockless(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (pmd_none(pmde)) + return SCAN_PMD_NONE; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Note that if false is returned, mmap_lock will be released. */ -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; @@ -976,12 +928,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, */ if (ret & VM_FAULT_RETRY) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + return SCAN_FAIL; } swapped_in++; } @@ -991,30 +944,40 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, lru_add_drain(); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced, int unmapped) +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE); + int node = hpage_collapse_find_target_node(cc); + + if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; - gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -1022,40 +985,34 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - /* - * __collapse_huge_page_swapin will return with mmap_lock released - * when it fails. So we jump out_nolock directly in that case. - * Continuing to collapse causes inconsistency. - */ - if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, - pmd, referenced)) { - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. + */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } mmap_read_unlock(mm); @@ -1065,11 +1022,12 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1083,21 +1041,24 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* - * After this gup_fast can't run anymore. This also removes - * any huge TLB entry from the CPU so we won't allow - * huge and small TLB entries for the same virtual address - * to avoid the risk of CPU bugs in that area. + * This removes any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address to + * avoid the risk of CPU bugs in that area. + * + * Parallel fast GUP is fine since fast GUP will back off when + * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); + tlb_remove_table_sync_one(); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1109,7 +1070,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1119,8 +1079,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1128,42 +1088,43 @@ static void collapse_huge_page(struct mm_struct *mm, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. */ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); - trace_mm_collapse_huge_page(mm, isolated, result); - return; + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1173,19 +1134,20 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) goto out; - } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1203,8 +1165,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1234,27 +1198,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out_unmap; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } } page = compound_head(page); /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1271,15 +1238,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Check if the page has any GUP (or other external) pins. * - * Here the check is racy it may see total_mapcount > refcount - * in some cases. - * For example, one process with one forked child process. - * The parent has the PMD split due to MADV_DONTNEED, then - * the child is trying unmap the whole PMD, but khugepaged - * may be scanning the parent between the child has - * PageDoubleMap flag cleared and dec the mapcount. So - * khugepaged may see total_mapcount > refcount. - * + * Here the check may be racy: + * it may see total_mapcount > refcount in some cases? * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check @@ -1289,43 +1249,51 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } -static void collect_mm_slot(struct mm_slot *mm_slot) +static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. @@ -1334,7 +1302,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) */ /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } @@ -1343,31 +1311,105 @@ static void collect_mm_slot(struct mm_slot *mm_slot) /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. + * + * Note that following race exists: + * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, + * emptying the A's ->pte_mapped_thp[] array. + * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and + * retract_page_tables() finds a VMA in mm_struct A mapping the same extent + * (at virtual address X) and adds an entry (for X) into mm_struct A's + * ->pte-mapped_thp[] array. + * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, + * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry + * (for X) into mm_struct A's ->pte-mapped_thp[] array. + * Thus, it's possible the same address is added multiple times for the same + * mm_struct. Should this happen, we'll simply attempt + * collapse_pte_mapped_thp() multiple times for the same address, under the same + * exclusive mmap_lock, and assuming the first call is successful, subsequent + * attempts will return quickly (without grabbing any additional locks) when + * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap + * check, and since this is a rare occurrence, the cost of preventing this + * "multiple-add" is thought to be more expensive than just handling it, should + * it occur. */ -static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; + bool ret = false; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + ret = true; + } spin_unlock(&khugepaged_mm_lock); + return ret; } +/* hpage must be locked, and mmap_lock must be held in write */ +static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct page *hpage) +{ + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .flags = 0, + .pmd = pmdp, + }; + + VM_BUG_ON(!PageTransHuge(hpage)); + mmap_assert_write_locked(vma->vm_mm); + + if (do_set_pmd(&vmf, hpage)) + return SCAN_FAIL; + + get_page(hpage); + return SCAN_SUCCEED; +} + +/* + * A note about locking: + * Trying to take the page table spinlocks would be useless here because those + * are only used to synchronize: + * + * - modifying terminal entries (ones that point to a data page, not to another + * page table) + * - installing *new* non-terminal entries + * + * Instead, we need roughly the same kind of protection as free_pgtables() or + * mm_take_all_locks() (but only for a single VMA): + * The mmap lock together with this VMA's rmap locks covers all paths towards + * the page table entries we're messing with here, except for hardware page + * table walks and lockless_pages_from_mm(). + */ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - spinlock_t *ptl; pmd_t pmd; + struct mmu_notifier_range range; mmap_assert_write_locked(mm); - ptl = pmd_lock(vma->vm_mm, pmdp); + if (vma->vm_file) + lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem); + /* + * All anon_vmas attached to the VMA have the same root and are + * therefore locked by the same lock. + */ + if (vma->anon_vma) + lockdep_assert_held_write(&vma->anon_vma->root->rwsem); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); - spin_unlock(ptl); + tlb_remove_table_sync_one(); + mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); @@ -1379,52 +1421,102 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v * * @mm: process address space where collapse happens * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. + * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ -void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { unsigned long haddr = addr & HPAGE_PMD_MASK; - struct vm_area_struct *vma = find_vma(mm, haddr); + struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd; spinlock_t *ptl; - int count = 0; + int count = 0, result = SCAN_FAIL; int i; + mmap_assert_write_locked(mm); + + /* Fast check before locking page if already PMD-mapped */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result == SCAN_PMD_MAPPED) + return result; + if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return; + return SCAN_VMA_CHECK; /* - * This vm_flags may not have VM_HUGEPAGE if the page was not - * collapsed by this mm. But we can still collapse if the page is - * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() - * will not fail the vma for missing VM_HUGEPAGE + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) - return; + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return SCAN_VMA_CHECK; + + /* + * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings + * that got written to. Without this, we'd have to also lock the + * anon_vma if one exists. + */ + if (vma->anon_vma) + return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) - return; + return SCAN_PTE_UFFD_WP; hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) - return; + return SCAN_PAGE_NULL; - if (!PageHead(hpage)) + if (!PageHead(hpage)) { + result = SCAN_FAIL; goto drop_hpage; + } - pmd = mm_find_pmd(mm, haddr); - if (!pmd) + if (compound_order(hpage) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; goto drop_hpage; + } + + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_PMD_NONE: + /* + * In MADV_COLLAPSE path, possible race with khugepaged where + * all pte entries have been removed and pmd cleared. If so, + * skip all the pte checks and just update the pmd mapping. + */ + goto maybe_install_pmd; + default: + goto drop_hpage; + } + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that + * we're operating on. + * See collapse_and_free_pmd(). + */ + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* + * This spinlock should be unnecessary: Nobody else should be accessing + * the page tables under spinlock protection here, only + * lockless_pages_from_mm() and the hardware page walker can access page + * tables while all the high-level locks are held in write mode. + */ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + result = SCAN_FAIL; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1436,8 +1528,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) + if (!pte_present(*pte)) { + result = SCAN_PTE_NON_PRESENT; goto abort; + } page = vm_normal_page(vma, addr, *pte); if (WARN_ON_ONCE(page && is_zone_device_page(page))) @@ -1472,21 +1566,32 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); } - /* step 4: collapse pmd */ + /* step 4: remove pte entries */ collapse_and_free_pmd(mm, vma, haddr, pmd); + + i_mmap_unlock_write(vma->vm_file->f_mapping); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, hpage) + : SCAN_SUCCEED; + drop_hpage: unlock_page(hpage); put_page(hpage); - return; + return result; abort: pte_unmap_unlock(start_pte, ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); goto drop_hpage; } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) @@ -1495,26 +1600,33 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (!mmap_write_trylock(mm)) return; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); } -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + struct mm_struct *target_mm, + unsigned long target_addr, struct page *hpage, + struct collapse_control *cc) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long addr; - pmd_t *pmd; + int target_result = SCAN_FAIL; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + int result = SCAN_FAIL; + struct mm_struct *mm = NULL; + unsigned long addr = 0; + pmd_t *pmd; + bool is_target = false; + /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth investing @@ -1529,27 +1641,37 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * An alternative would be drop the check, but check that page * table is clear before calling pmdp_collapse_flush() under * ptl. It has higher chance to recover THP for the VMA, but - * has higher cost too. + * has higher cost too. It would also probably require locking + * the anon_vma. */ - if (vma->anon_vma) - continue; + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto next; + } addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr & ~HPAGE_PMD_MASK) - continue; - if (vma->vm_end < addr + HPAGE_PMD_SIZE) - continue; + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) { + result = SCAN_VMA_CHECK; + goto next; + } mm = vma->vm_mm; - pmd = mm_find_pmd(mm, addr); - if (!pmd) - continue; + is_target = mm == target_mm && addr == target_addr; + result = find_pmd_or_thp_or_none(mm, addr, &pmd); + if (result != SCAN_SUCCEED) + goto next; /* * We need exclusive mmap_lock to retract page table. * * We use trylock due to lock inversion: we need to acquire * mmap_lock while holding page lock. Fault path does it in * reverse order. Trylock is a way to avoid deadlock. + * + * Also, it's not MADV_COLLAPSE's job to collapse other + * mappings - let khugepaged take care of them later. */ - if (mmap_write_trylock(mm)) { + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte @@ -1558,25 +1680,48 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) - collapse_and_free_pmd(mm, vma, addr, pmd); + if (hpage_collapse_test_exit(mm)) { + result = SCAN_ANY_PROCESS; + goto unlock_next; + } + if (userfaultfd_wp(vma)) { + result = SCAN_PTE_UFFD_WP; + goto unlock_next; + } + collapse_and_free_pmd(mm, vma, addr, pmd); + if (!cc->is_khugepaged && is_target) + result = set_huge_pmd(vma, addr, pmd, hpage); + else + result = SCAN_SUCCEED; + +unlock_next: mmap_write_unlock(mm); - } else { - /* Try again later */ + goto next; + } + /* + * Calling context will handle target mm/addr. Otherwise, let + * khugepaged try again later. + */ + if (!is_target) { khugepaged_add_pte_mapped_thp(mm, addr); + continue; } +next: + if (is_target) + target_result = result; } i_mmap_unlock_write(mapping); + return target_result; } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens + * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse - * @node: appointed node the new huge page allocate from + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1593,37 +1738,25 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, - struct file *file, pgoff_t start, - struct page **hpage, int node) +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - gfp_t gfp; - struct page *new_page; - pgoff_t index, end = start + HPAGE_PMD_NR; + struct page *hpage; + pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr; + int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; - goto out; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* * Ensure we have slots for all the pages in the range. This is @@ -1641,14 +1774,14 @@ static void collapse_file(struct mm_struct *mm, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1656,6 +1789,7 @@ static void collapse_file(struct mm_struct *mm, xas_set(&xas, start); for (index = start; index < end; index++) { struct page *page = xas_next(&xas); + struct folio *folio; VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -1676,7 +1810,7 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } @@ -1684,11 +1818,12 @@ static void collapse_file(struct mm_struct *mm, if (xa_is_value(page) || !PageUptodate(page)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOALLOC)) { + if (shmem_get_folio(mapping->host, index, + &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } + page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); @@ -1755,19 +1890,28 @@ static void collapse_file(struct mm_struct *mm, /* * If file was truncated then extended, or hole-punched, before * we locked the first page, then a THP might be there already. + * This will be discovered on the first iteration. */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; goto out_unlock; } - if (page_mapping(page) != mapping) { + folio = page_folio(page); + + if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - if (!is_shmem && (PageDirty(page) || - PageWriteback(page))) { + if (!is_shmem && (folio_test_dirty(folio) || + folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed @@ -1777,20 +1921,20 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } - if (isolate_lru_page(page)) { + if (folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) { + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; - putback_lru_page(page); + folio_putback_lru(folio); goto out_unlock; } - if (page_mapped(page)) - try_to_unmap(page_folio(page), + if (folio_mapped(folio)) + try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); @@ -1818,19 +1962,19 @@ static void collapse_file(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1841,21 +1985,21 @@ out_unlock: smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1869,6 +2013,7 @@ xa_unlocked: if (result == SCAN_SUCCEED) { struct page *page, *tmp; + struct folio *folio; /* * Replacing old pages with new one has succeeded, now we @@ -1877,11 +2022,11 @@ xa_unlocked: index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1892,23 +2037,25 @@ xa_unlocked: index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + folio = page_folio(hpage); + folio_mark_uptodate(folio); + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + folio_mark_dirty(folio); + folio_add_lru(folio); /* * Remove pte page tables, so we can re-fault the page as huge. */ - retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); + unlock_page(hpage); + hpage = NULL; } else { struct page *page; @@ -1947,19 +2094,25 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + if (hpage) + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); - /* TODO: tracepoints */ + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + + trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1970,14 +2123,17 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + ++swap; + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1986,20 +2142,32 @@ static void khugepaged_scan_file(struct mm_struct *mm, } /* - * XXX: khugepaged should compact smaller compound pages + * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. + */ break; } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -2028,54 +2196,67 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + result = collapse_file(mm, addr, file, start, cc); } } - /* TODO: tracepoints */ + trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { BUILD_BUG(); } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) +{ +} + +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) { + return false; } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { - struct mm_slot *mm_slot; + struct vma_iterator vmi; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; - if (khugepaged_scan.mm_slot) + if (khugepaged_scan.mm_slot) { mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, + slot = &mm_slot->slot; + } else { + slot = list_entry(khugepaged_scan.mm_head.next, struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); - mm = mm_slot->mm; + mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. @@ -2083,19 +2264,21 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) - vma = find_vma(mm, khugepaged_scan.address); progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(hpage_collapse_test_exit(mm))) + goto breakouterloop; + + vma_iter_init(&vmi, mm, khugepaged_scan.address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { skip: progress++; continue; @@ -2109,9 +2292,10 @@ skip: VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2123,19 +2307,48 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, + file, pgoff, cc); + mmap_locked = false; fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); + } + switch (*result) { + case SCAN_PTE_MAPPED_HUGEPAGE: { + pmd_t *pmd; + + *result = find_pmd_or_thp_or_none(mm, + khugepaged_scan.address, + &pmd); + if (*result != SCAN_SUCCEED) + break; + if (!khugepaged_add_pte_mapped_thp(mm, + khugepaged_scan.address)) + break; + } fallthrough; + case SCAN_SUCCEED: + ++khugepaged_pages_collapsed; + break; + default: + break; } + /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2151,16 +2364,17 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); + if (slot->mm_node.next != &khugepaged_scan.mm_head) { + slot = list_entry(slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.mm_slot = + mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; @@ -2185,19 +2399,16 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - + while (true) { cond_resched(); if (unlikely(kthread_should_stop() || try_to_freeze())) @@ -2209,14 +2420,25 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. + */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) @@ -2247,13 +2469,13 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } @@ -2352,3 +2574,145 @@ void khugepaged_min_free_kbytes_update(void) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } + +bool current_is_khugepaged(void) +{ + return kthread_func(current) == khugepaged; +} + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. + */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + *prev = vma; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + + hend = vma->vm_end & HPAGE_PMD_MASK; + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, vma, addr, + &mmap_locked, cc); + } + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + +handle_result: + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + BUG_ON(*prev); + mmap_write_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_write_unlock(mm); + goto handle_result; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 1eddc0132f7f..92f670edbf51 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -79,6 +79,7 @@ #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/stacktrace.h> +#include <linux/stackdepot.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/memblock.h> @@ -159,8 +160,7 @@ struct kmemleak_object { u32 checksum; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; - unsigned long trace[MAX_TRACE]; - unsigned int trace_len; + depot_stack_handle_t trace_handle; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current task */ char comm[TASK_COMM_LEN]; /* executable name */ @@ -346,19 +346,22 @@ static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; + unsigned long *entries; + unsigned int nr_entries; unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); + nr_entries = stack_depot_fetch(object->trace_handle, &entries); warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); + object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", - object->comm, object->pid, object->jiffies, - msecs_age / 1000, msecs_age % 1000); + object->comm, object->pid, object->jiffies, + msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); warn_or_seq_printf(seq, " backtrace:\n"); - for (i = 0; i < object->trace_len; i++) { - void *ptr = (void *)object->trace[i]; - warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + for (i = 0; i < nr_entries; i++) { + void *ptr = (void *)entries[i]; + warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); } } @@ -370,15 +373,16 @@ static void print_unreferenced(struct seq_file *seq, static void dump_object_info(struct kmemleak_object *object) { pr_notice("Object 0x%08lx (size %zu):\n", - object->pointer, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); + object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); - stack_trace_print(object->trace, object->trace_len, 4); + if (object->trace_handle) + stack_depot_print(object->trace_handle); } /* @@ -591,12 +595,18 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali return object; } -/* - * Save stack trace to the given array of MAX_TRACE size. - */ -static int __save_stack_trace(unsigned long *trace) +static noinline depot_stack_handle_t set_track_prepare(void) { - return stack_trace_save(trace, MAX_TRACE, 2); + depot_stack_handle_t trace_handle; + unsigned long entries[MAX_TRACE]; + unsigned int nr_entries; + + if (!kmemleak_initialized) + return 0; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); + trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + + return trace_handle; } /* @@ -604,9 +614,8 @@ static int __save_stack_trace(unsigned long *trace) * memory block and add it to the object_list and object_tree_root (or * object_phys_tree_root). */ -static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp, - bool is_phys) +static void __create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp, bool is_phys) { unsigned long flags; struct kmemleak_object *object, *parent; @@ -618,7 +627,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); - return NULL; + return; } INIT_LIST_HEAD(&object->object_list); @@ -654,7 +663,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, } /* kernel backtrace */ - object->trace_len = __save_stack_trace(object->trace); + object->trace_handle = set_track_prepare(); raw_spin_lock_irqsave(&kmemleak_lock, flags); @@ -687,32 +696,29 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, */ dump_object_info(parent); kmem_cache_free(object_cache, object); - object = NULL; goto out; } } rb_link_node(&object->rb_node, rb_parent, link); rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : &object_tree_root); - list_add_tail_rcu(&object->object_list, &object_list); out: raw_spin_unlock_irqrestore(&kmemleak_lock, flags); - return object; } /* Create kmemleak object which allocated with virtual address. */ -static struct kmemleak_object *create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp) +static void create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { - return __create_object(ptr, size, min_count, gfp, false); + __create_object(ptr, size, min_count, gfp, false); } /* Create kmemleak object which allocated with physical address. */ -static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size, - int min_count, gfp_t gfp) +static void create_object_phys(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { - return __create_object(ptr, size, min_count, gfp, true); + __create_object(ptr, size, min_count, gfp, true); } /* @@ -1094,7 +1100,7 @@ void __ref kmemleak_update_trace(const void *ptr) } raw_spin_lock_irqsave(&object->lock, flags); - object->trace_len = __save_stack_trace(object->trace); + object->trace_handle = set_track_prepare(); raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); @@ -1464,6 +1470,27 @@ static void scan_gray_list(void) } /* + * Conditionally call resched() in an object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1474,7 +1501,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1483,7 +1510,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1517,24 +1543,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1601,8 +1614,16 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. @@ -1635,8 +1656,16 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. @@ -2064,6 +2093,7 @@ void __init kmemleak_init(void) if (kmemleak_error) return; + stack_depot_init(); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile new file mode 100644 index 000000000000..98eab2856626 --- /dev/null +++ b/mm/kmsan/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for KernelMemorySanitizer (KMSAN). +# +# +obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o + +KMSAN_SANITIZE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n + +# Disable instrumentation of KMSAN runtime with other tools. +CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector +CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) +CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) + +obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o +KMSAN_SANITIZE_kmsan_test.o := y +CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c new file mode 100644 index 000000000000..112dce135c7f --- /dev/null +++ b/mm/kmsan/core.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN runtime library. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include <asm/page.h> +#include <linux/compiler.h> +#include <linux/export.h> +#include <linux/highmem.h> +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/kmsan_types.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/mm_types.h> +#include <linux/mmzone.h> +#include <linux/percpu-defs.h> +#include <linux/preempt.h> +#include <linux/slab.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/types.h> +#include <linux/vmalloc.h> + +#include "../slab.h" +#include "kmsan.h" + +bool kmsan_enabled __read_mostly; + +/* + * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is + * unavaliable. + */ +DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +void kmsan_internal_task_create(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + struct thread_info *info = current_thread_info(); + + __memset(ctx, 0, sizeof(*ctx)); + ctx->allow_reporting = true; + kmsan_internal_unpoison_memory(info, sizeof(*info), false); +} + +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags) +{ + u32 extra_bits = + kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE); + bool checked = poison_flags & KMSAN_POISON_CHECK; + depot_stack_handle_t handle; + + handle = kmsan_save_stack_with_flags(flags, extra_bits); + kmsan_internal_set_shadow_origin(address, size, -1, handle, checked); +} + +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked) +{ + kmsan_internal_set_shadow_origin(address, size, 0, 0, checked); +} + +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra) +{ + unsigned long entries[KMSAN_STACK_DEPTH]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); + + /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ + flags &= ~__GFP_DIRECT_RECLAIM; + + return __stack_depot_save(entries, nr_entries, extra, flags, true); +} + +/* Copy the metadata following the memmove() behavior. */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) +{ + depot_stack_handle_t old_origin = 0, new_origin = 0; + int src_slots, dst_slots, i, iter, step, skip_bits; + depot_stack_handle_t *origin_src, *origin_dst; + void *shadow_src, *shadow_dst; + u32 *align_shadow_src, shadow; + bool backwards; + + shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); + if (!shadow_dst) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + + shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); + if (!shadow_src) { + /* + * @src is untracked: zero out destination shadow, ignore the + * origins, we're done. + */ + __memset(shadow_dst, 0, n); + return; + } + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); + + __memmove(shadow_dst, shadow_src, n); + + origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); + origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin_dst || !origin_src); + src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); + KMSAN_WARN_ON((src_slots - dst_slots > 1) || + (dst_slots - src_slots < -1)); + + backwards = dst > src; + i = backwards ? min(src_slots, dst_slots) - 1 : 0; + iter = backwards ? -1 : 1; + + align_shadow_src = + (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); + for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { + KMSAN_WARN_ON(i < 0); + shadow = align_shadow_src[i]; + if (i == 0) { + /* + * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't + * look at the first @src % KMSAN_ORIGIN_SIZE bytes + * of the first shadow slot. + */ + skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + } + if (i == src_slots - 1) { + /* + * If @src + n isn't aligned on + * KMSAN_ORIGIN_SIZE, don't look at the last + * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the + * last shadow slot. + */ + skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + } + /* + * Overwrite the origin only if the corresponding + * shadow is nonempty. + */ + if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { + old_origin = origin_src[i]; + new_origin = kmsan_internal_chain_origin(old_origin); + /* + * kmsan_internal_chain_origin() may return + * NULL, but we don't want to lose the previous + * origin value. + */ + if (!new_origin) + new_origin = old_origin; + } + if (shadow) + origin_dst[i] = new_origin; + else + origin_dst[i] = 0; + } + /* + * If dst_slots is greater than src_slots (i.e. + * dst_slots == src_slots + 1), there is an extra origin slot at the + * beginning or end of the destination buffer, for which we take the + * origin from the previous slot. + * This is only done if the part of the source shadow corresponding to + * slot is non-zero. + * + * E.g. if we copy 8 aligned bytes that are marked as uninitialized + * and have origins o111 and o222, to an unaligned buffer with offset 1, + * these two origins are copied to three origin slots, so one of then + * needs to be duplicated, depending on the copy direction (@backwards) + * + * src shadow: |uuuu|uuuu|....| + * src origin: |o111|o222|....| + * + * backwards = 0: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |....|o111|o222| - fill the empty slot with o111 + * backwards = 1: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |o111|o222|....| - fill the empty slot with o222 + */ + if (src_slots < dst_slots) { + if (backwards) { + shadow = align_shadow_src[src_slots - 1]; + skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + if (shadow) + /* src_slots > 0, therefore dst_slots is at least 2 */ + origin_dst[dst_slots - 1] = + origin_dst[dst_slots - 2]; + } else { + shadow = align_shadow_src[0]; + skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + if (shadow) + origin_dst[0] = origin_dst[1]; + } + } +} + +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) +{ + unsigned long entries[3]; + u32 extra_bits; + int depth; + bool uaf; + + if (!id) + return id; + /* + * Make sure we have enough spare bits in @id to hold the UAF bit and + * the chain depth. + */ + BUILD_BUG_ON( + (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + + extra_bits = stack_depot_get_extra_bits(id); + depth = kmsan_depth_from_eb(extra_bits); + uaf = kmsan_uaf_from_eb(extra_bits); + + /* + * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH. + * This mostly happens in the case structures with uninitialized padding + * are copied around many times. Origin chains for such structures are + * usually periodic, and it does not make sense to fully store them. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + return id; + + depth++; + extra_bits = kmsan_extra_bits(depth, uaf); + + entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN; + entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0); + entries[2] = id; + /* + * @entries is a local var in non-instrumented code, so KMSAN does not + * know it is initialized. Explicitly unpoison it to avoid false + * positives when __stack_depot_save() passes it to instrumented code. + */ + kmsan_internal_unpoison_memory(entries, sizeof(entries), false); + return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, + GFP_ATOMIC, true); +} + +void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, + u32 origin, bool checked) +{ + u64 address = (u64)addr; + void *shadow_start; + u32 *origin_start; + size_t pad = 0; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW); + if (!shadow_start) { + /* + * kmsan_metadata_is_contiguous() is true, so either all shadow + * and origin pages are NULL, or all are non-NULL. + */ + if (checked) { + pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n", + __func__, size, addr); + KMSAN_WARN_ON(true); + } + return; + } + __memset(shadow_start, b, size); + + if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + pad = address % KMSAN_ORIGIN_SIZE; + address -= pad; + size += pad; + } + size = ALIGN(size, KMSAN_ORIGIN_SIZE); + origin_start = + (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN); + + for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) + origin_start[i] = origin; +} + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr) +{ + struct page *page; + + if (!kmsan_internal_is_vmalloc_addr(vaddr) && + !kmsan_internal_is_module_addr(vaddr)) + return NULL; + page = vmalloc_to_page(vaddr); + if (pfn_valid(page_to_pfn(page))) + return page; + else + return NULL; +} + +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason) +{ + depot_stack_handle_t cur_origin = 0, new_origin = 0; + unsigned long addr64 = (unsigned long)addr; + depot_stack_handle_t *origin = NULL; + unsigned char *shadow = NULL; + int cur_off_start = -1; + int chunk_size; + size_t pos = 0; + + if (!size) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + while (pos < size) { + chunk_size = min(size - pos, + PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE)); + shadow = kmsan_get_metadata((void *)(addr64 + pos), + KMSAN_META_SHADOW); + if (!shadow) { + /* + * This page is untracked. If there were uninitialized + * bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos - 1, user_addr, + reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + pos += chunk_size; + continue; + } + for (int i = 0; i < chunk_size; i++) { + if (!shadow[i]) { + /* + * This byte is unpoisoned. If there were + * poisoned bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + continue; + } + origin = kmsan_get_metadata((void *)(addr64 + pos + i), + KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin); + new_origin = *origin; + /* + * Encountered new origin - report the previous + * uninitialized range. + */ + if (cur_origin != new_origin) { + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = new_origin; + cur_off_start = pos + i; + } + } + pos += chunk_size; + } + KMSAN_WARN_ON(pos != size); + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, + user_addr, reason); + kmsan_leave_runtime(); + } +} + +bool kmsan_metadata_is_contiguous(void *addr, size_t size) +{ + char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL, + *next_origin = NULL; + u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE; + depot_stack_handle_t *origin_p; + bool all_untracked = false; + + if (!size) + return true; + + /* The whole range belongs to the same page. */ + if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) == + ALIGN_DOWN(cur_addr, PAGE_SIZE)) + return true; + + cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false); + if (!cur_shadow) + all_untracked = true; + cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true); + if (all_untracked && cur_origin) + goto report; + + for (; next_addr < (u64)addr + size; + cur_addr = next_addr, cur_shadow = next_shadow, + cur_origin = next_origin, next_addr += PAGE_SIZE) { + next_shadow = kmsan_get_metadata((void *)next_addr, false); + next_origin = kmsan_get_metadata((void *)next_addr, true); + if (all_untracked) { + if (next_shadow || next_origin) + goto report; + if (!next_shadow && !next_origin) + continue; + } + if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) && + ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE))) + continue; + goto report; + } + return true; + +report: + pr_err("%s: attempting to access two shadow page ranges.\n", __func__); + pr_err("Access of size %ld at %px.\n", size, addr); + pr_err("Addresses belonging to different ranges: %px and %px\n", + (void *)cur_addr, (void *)next_addr); + pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow, + next_shadow); + pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin, + next_origin); + origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN); + if (origin_p) { + pr_err("Origin: %08x\n", *origin_p); + kmsan_print_origin(*origin_p); + } else { + pr_err("Origin: unavailable\n"); + } + return false; +} diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c new file mode 100644 index 000000000000..3807502766a3 --- /dev/null +++ b/mm/kmsan/hooks.c @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN hooks for kernel subsystems. + * + * These functions handle creation of KMSAN metadata for memory allocations. + * + * Copyright (C) 2018-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include <linux/cacheflush.h> +#include <linux/dma-direction.h> +#include <linux/gfp.h> +#include <linux/kmsan.h> +#include <linux/mm.h> +#include <linux/mm_types.h> +#include <linux/scatterlist.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/usb.h> + +#include "../internal.h" +#include "../slab.h" +#include "kmsan.h" + +/* + * Instrumented functions shouldn't be called under + * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to + * skipping effects of functions like memset() inside instrumented code. + */ + +void kmsan_task_create(struct task_struct *task) +{ + kmsan_enter_runtime(); + kmsan_internal_task_create(task); + kmsan_leave_runtime(); +} + +void kmsan_task_exit(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ctx->allow_reporting = false; +} + +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (unlikely(object == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * There's a ctor or this is an RCU cache - do nothing. The memory + * status hasn't changed since last use. + */ + if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU)) + return; + + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory(object, s->object_size, + KMSAN_POISON_CHECK); + else + kmsan_internal_poison_memory(object, s->object_size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_slab_free(struct kmem_cache *s, void *object) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))) + return; + /* + * If there's a constructor, freed memory must remain in the same state + * until the next allocation. We cannot save its state to detect + * use-after-free bugs, instead we just keep it unpoisoned. + */ + if (s->ctor) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + if (unlikely(ptr == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory((void *)ptr, size, + /*checked*/ true); + else + kmsan_internal_poison_memory((void *)ptr, size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_kfree_large(const void *ptr) +{ + struct page *page; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + page = virt_to_head_page((void *)ptr); + KMSAN_WARN_ON(ptr != page_address(page)); + kmsan_internal_poison_memory((void *)ptr, + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +static unsigned long vmalloc_shadow(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_SHADOW); +} + +static unsigned long vmalloc_origin(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_ORIGIN); +} + +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) +{ + __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end)); + __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end)); + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); +} + +/* + * This function creates new shadow/origin pages for the physical pages mapped + * into the virtual memory. If those physical pages already had shadow/origin, + * those are ignored. + */ +void kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) +{ + gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; + struct page *shadow, *origin; + unsigned long off = 0; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + shadow = alloc_pages(gfp_mask, 1); + origin = alloc_pages(gfp_mask, 1); + __vmap_pages_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, + PAGE_SHIFT); + __vmap_pages_range_noflush( + vmalloc_origin(start + off), + vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, + PAGE_SHIFT); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_iounmap_page_range(unsigned long start, unsigned long end) +{ + unsigned long v_shadow, v_origin; + struct page *shadow, *origin; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + v_shadow = (unsigned long)vmalloc_shadow(start); + v_origin = (unsigned long)vmalloc_origin(start); + for (int i = 0; i < nr; + i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) { + shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow); + origin = kmsan_vmalloc_to_page_or_null((void *)v_origin); + __vunmap_range_noflush(v_shadow, vmalloc_shadow(end)); + __vunmap_range_noflush(v_origin, vmalloc_origin(end)); + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * At this point we've copied the memory already. It's hard to check it + * before copying, as the size of actually copied buffer is unknown. + */ + + /* copy_to_user() may copy zero bytes. No need to check. */ + if (!to_copy) + return; + /* Or maybe copy_to_user() failed to copy anything. */ + if (to_copy <= left) + return; + + ua_flags = user_access_save(); + if ((u64)to < TASK_SIZE) { + /* This is a user memory access, check it. */ + kmsan_internal_check_memory((void *)from, to_copy - left, to, + REASON_COPY_TO_USER); + } else { + /* Otherwise this is a kernel memory access. This happens when a + * compat syscall passes an argument allocated on the kernel + * stack to a real syscall. + * Don't check anything, just copy the shadow of the copied + * bytes. + */ + kmsan_internal_memmove_metadata((void *)to, (void *)from, + to_copy - left); + } + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_copy_to_user); + +/* Helper function to check an URB. */ +void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ + if (!urb) + return; + if (is_out) + kmsan_internal_check_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*user_addr*/ 0, REASON_SUBMIT_URB); + else + kmsan_internal_unpoison_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*checked*/ false); +} +EXPORT_SYMBOL_GPL(kmsan_handle_urb); + +static void kmsan_handle_dma_page(const void *addr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_TO_DEVICE: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + break; + case DMA_FROM_DEVICE: + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_NONE: + break; + } +} + +/* Helper function to handle DMA data transfers. */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ + u64 page_offset, to_go, addr; + + if (PageHighMem(page)) + return; + addr = (u64)page_address(page) + offset; + /* + * The kernel may occasionally give us adjacent DMA pages not belonging + * to the same allocation. Process them separately to avoid triggering + * internal KMSAN checks. + */ + while (size > 0) { + page_offset = addr % PAGE_SIZE; + to_go = min(PAGE_SIZE - page_offset, (u64)size); + kmsan_handle_dma_page((void *)addr, to_go, dir); + addr += to_go; + size -= to_go; + } +} + +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + struct scatterlist *item; + int i; + + for_each_sg(sg, item, nents, i) + kmsan_handle_dma(sg_page(item), item->offset, item->length, + dir); +} + +/* Functions from kmsan-checks.h follow. */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_poison_memory((void *)address, size, flags, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_poison_memory); + +void kmsan_unpoison_memory(const void *address, size_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_unpoison_memory((void *)address, size, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_unpoison_memory); + +/* + * Version of kmsan_unpoison_memory() that can be called from within the KMSAN + * runtime. + * + * Non-instrumented IRQ entry functions receive struct pt_regs from assembly + * code. Those regs need to be unpoisoned, otherwise using them will result in + * false positives. + * Using kmsan_unpoison_memory() is not an option in entry code, because the + * return value of in_task() is inconsistent - as a result, certain calls to + * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that + * the registers are unpoisoned even if kmsan_in_runtime() is true in the early + * entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ + unsigned long ua_flags; + + if (!kmsan_enabled) + return; + + ua_flags = user_access_save(); + kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), + KMSAN_POISON_NOCHECK); + user_access_restore(ua_flags); +} + +void kmsan_check_memory(const void *addr, size_t size) +{ + if (!kmsan_enabled) + return; + return kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); +} +EXPORT_SYMBOL(kmsan_check_memory); diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c new file mode 100644 index 000000000000..7fb794242fad --- /dev/null +++ b/mm/kmsan/init.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN initialization routines. + * + * Copyright (C) 2017-2021 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include "kmsan.h" + +#include <asm/sections.h> +#include <linux/mm.h> +#include <linux/memblock.h> + +#include "../internal.h" + +#define NUM_FUTURE_RANGES 128 +struct start_end_pair { + u64 start, end; +}; + +static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata; +static int future_index __initdata; + +/* + * Record a range of memory for which the metadata pages will be created once + * the page allocator becomes available. + */ +static void __init kmsan_record_future_shadow_range(void *start, void *end) +{ + u64 nstart = (u64)start, nend = (u64)end, cstart, cend; + bool merged = false; + + KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); + KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend); + nstart = ALIGN_DOWN(nstart, PAGE_SIZE); + nend = ALIGN(nend, PAGE_SIZE); + + /* + * Scan the existing ranges to see if any of them overlaps with + * [start, end). In that case, merge the two ranges instead of + * creating a new one. + * The number of ranges is less than 20, so there is no need to organize + * them into a more intelligent data structure. + */ + for (int i = 0; i < future_index; i++) { + cstart = start_end_pairs[i].start; + cend = start_end_pairs[i].end; + if ((cstart < nstart && cend < nstart) || + (cstart > nend && cend > nend)) + /* ranges are disjoint - do not merge */ + continue; + start_end_pairs[i].start = min(nstart, cstart); + start_end_pairs[i].end = max(nend, cend); + merged = true; + break; + } + if (merged) + return; + start_end_pairs[future_index].start = nstart; + start_end_pairs[future_index].end = nend; + future_index++; +} + +/* + * Initialize the shadow for existing mappings during kernel initialization. + * These include kernel text/data sections, NODE_DATA and future ranges + * registered while creating other data (e.g. percpu). + * + * Allocations via memblock can be only done before slab is initialized. + */ +void __init kmsan_init_shadow(void) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + phys_addr_t p_start, p_end; + u64 loop; + int nid; + + for_each_reserved_mem_range(loop, &p_start, &p_end) + kmsan_record_future_shadow_range(phys_to_virt(p_start), + phys_to_virt(p_end)); + /* Allocate shadow for .data */ + kmsan_record_future_shadow_range(_sdata, _edata); + + for_each_online_node(nid) + kmsan_record_future_shadow_range( + NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size); + + for (int i = 0; i < future_index; i++) + kmsan_init_alloc_meta_for_range( + (void *)start_end_pairs[i].start, + (void *)start_end_pairs[i].end); +} + +struct metadata_page_pair { + struct page *shadow, *origin; +}; +static struct metadata_page_pair held_back[MAX_ORDER] __initdata; + +/* + * Eager metadata allocation. When the memblock allocator is freeing pages to + * pagealloc, we use 2/3 of them as metadata for the remaining 1/3. + * We store the pointers to the returned blocks of pages in held_back[] grouped + * by their order: when kmsan_memblock_free_pages() is called for the first + * time with a certain order, it is reserved as a shadow block, for the second + * time - as an origin block. On the third time the incoming block receives its + * shadow and origin ranges from the previously saved shadow and origin blocks, + * after which held_back[order] can be used again. + * + * At the very end there may be leftover blocks in held_back[]. They are + * collected later by kmsan_memblock_discard(). + */ +bool kmsan_memblock_free_pages(struct page *page, unsigned int order) +{ + struct page *shadow, *origin; + + if (!held_back[order].shadow) { + held_back[order].shadow = page; + return false; + } + if (!held_back[order].origin) { + held_back[order].origin = page; + return false; + } + shadow = held_back[order].shadow; + origin = held_back[order].origin; + kmsan_setup_meta(page, shadow, origin, order); + + held_back[order].shadow = NULL; + held_back[order].origin = NULL; + return true; +} + +#define MAX_BLOCKS 8 +struct smallstack { + struct page *items[MAX_BLOCKS]; + int index; + int order; +}; + +static struct smallstack collect = { + .index = 0, + .order = MAX_ORDER, +}; + +static void smallstack_push(struct smallstack *stack, struct page *pages) +{ + KMSAN_WARN_ON(stack->index == MAX_BLOCKS); + stack->items[stack->index] = pages; + stack->index++; +} +#undef MAX_BLOCKS + +static struct page *smallstack_pop(struct smallstack *stack) +{ + struct page *ret; + + KMSAN_WARN_ON(stack->index == 0); + stack->index--; + ret = stack->items[stack->index]; + stack->items[stack->index] = NULL; + return ret; +} + +static void do_collection(void) +{ + struct page *page, *shadow, *origin; + + while (collect.index >= 3) { + page = smallstack_pop(&collect); + shadow = smallstack_pop(&collect); + origin = smallstack_pop(&collect); + kmsan_setup_meta(page, shadow, origin, collect.order); + __free_pages_core(page, collect.order); + } +} + +static void collect_split(void) +{ + struct smallstack tmp = { + .order = collect.order - 1, + .index = 0, + }; + struct page *page; + + if (!collect.order) + return; + while (collect.index) { + page = smallstack_pop(&collect); + smallstack_push(&tmp, &page[0]); + smallstack_push(&tmp, &page[1 << tmp.order]); + } + __memcpy(&collect, &tmp, sizeof(tmp)); +} + +/* + * Memblock is about to go away. Split the page blocks left over in held_back[] + * and return 1/3 of that memory to the system. + */ +static void kmsan_memblock_discard(void) +{ + /* + * For each order=N: + * - push held_back[N].shadow and .origin to @collect; + * - while there are >= 3 elements in @collect, do garbage collection: + * - pop 3 ranges from @collect; + * - use two of them as shadow and origin for the third one; + * - repeat; + * - split each remaining element from @collect into 2 ranges of + * order=N-1, + * - repeat. + */ + collect.order = MAX_ORDER - 1; + for (int i = MAX_ORDER - 1; i >= 0; i--) { + if (held_back[i].shadow) + smallstack_push(&collect, held_back[i].shadow); + if (held_back[i].origin) + smallstack_push(&collect, held_back[i].origin); + held_back[i].shadow = NULL; + held_back[i].origin = NULL; + do_collection(); + collect_split(); + } +} + +void __init kmsan_init_runtime(void) +{ + /* Assuming current is init_task */ + kmsan_internal_task_create(current); + kmsan_memblock_discard(); + pr_info("Starting KernelMemorySanitizer\n"); + pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n"); + kmsan_enabled = true; +} diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c new file mode 100644 index 000000000000..770fe02904f3 --- /dev/null +++ b/mm/kmsan/instrumentation.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN compiler API. + * + * This file implements __msan_XXX hooks that Clang inserts into the code + * compiled with -fsanitize=kernel-memory. + * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN + * instrumentation works. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include "kmsan.h" +#include <linux/gfp.h> +#include <linux/kmsan_string.h> +#include <linux/mm.h> +#include <linux/uaccess.h> + +static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store) +{ + if ((u64)addr < TASK_SIZE) + return true; + if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW)) + return true; + return false; +} + +static inline struct shadow_origin_ptr +get_shadow_origin_ptr(void *addr, u64 size, bool store) +{ + unsigned long ua_flags = user_access_save(); + struct shadow_origin_ptr ret; + + ret = kmsan_get_shadow_origin_ptr(addr, size, store); + user_access_restore(ua_flags); + return ret; +} + +/* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ false); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); + +/* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ true); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); + +/* + * Declare functions that obtain shadow/origin pointers for loads and stores + * with fixed size. + */ +#define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ false); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ true); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size) + +DECLARE_METADATA_PTR_GETTER(1); +DECLARE_METADATA_PTR_GETTER(2); +DECLARE_METADATA_PTR_GETTER(4); +DECLARE_METADATA_PTR_GETTER(8); + +/* + * Handle a memory store performed by inline assembly. KMSAN conservatively + * attempts to unpoison the outputs of asm() directives to prevent false + * positives caused by missed stores. + * + * __msan_instrument_asm_store() may be called for inline assembly code when + * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure + * the memory written to in these cases is also marked as initialized. + */ +void __msan_instrument_asm_store(void *addr, uintptr_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled) + return; + + ua_flags = user_access_save(); + /* + * Most of the accesses are below 32 bytes. The two exceptions so far + * are clwb() (64 bytes) and FPU state (512 bytes). + * It's unlikely that the assembly will touch more than 512 bytes. + */ + if (size > 512) { + WARN_ONCE(1, "assembly store size too big: %ld\n", size); + size = 8; + } + if (is_bad_asm_addr(addr, size, /*is_store*/ true)) { + user_access_restore(ua_flags); + return; + } + /* Unpoisoning the memory on best effort. */ + kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_instrument_asm_store); + +/* + * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset + * intrinsics with calls to respective __msan_ functions. We use + * get_param0_metadata() and set_retval_metadata() to store the shadow/origin + * values for the destination argument of these functions and use them for the + * functions' return values. + */ +static inline void get_param0_metadata(u64 *shadow, + depot_stack_handle_t *origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *shadow = *(u64 *)(ctx->cstate.param_tls); + *origin = ctx->cstate.param_origin_tls[0]; +} + +static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *(u64 *)(ctx->cstate.retval_tls) = shadow; + ctx->cstate.retval_origin_tls = origin; +} + +/* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memmove(dst, src, n); + if (!n) + /* Some people call memmove() with zero length. */ + return result; + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memmove); + +/* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memcpy(dst, src, n); + if (!n) + /* Some people call memcpy() with zero length. */ + return result; + + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* Using memmove instead of memcpy doesn't affect correctness. */ + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memcpy); + +/* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memset(dst, c, n); + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* + * Clang doesn't pass parameter metadata here, so it is impossible to + * use shadow of @c to set up the shadow for @dst. + */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memset); + +/* + * Create a new origin from an old one. This is done when storing an + * uninitialized value to memory. When reporting an error, KMSAN unrolls and + * prints the whole chain of stores that preceded the use of this value. + */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) +{ + depot_stack_handle_t ret = 0; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return ret; + + ua_flags = user_access_save(); + + /* Creating new origins may allocate memory. */ + kmsan_enter_runtime(); + ret = kmsan_internal_chain_origin(origin); + kmsan_leave_runtime(); + user_access_restore(ua_flags); + return ret; +} +EXPORT_SYMBOL(__msan_chain_origin); + +/* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr) +{ + depot_stack_handle_t handle; + unsigned long entries[4]; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN; + entries[1] = (u64)descr; + entries[2] = (u64)__builtin_return_address(0); + /* + * With frame pointers enabled, it is possible to quickly fetch the + * second frame of the caller stack without calling the unwinder. + * Without them, simply do not bother. + */ + if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) + entries[3] = (u64)__builtin_return_address(1); + else + entries[3] = 0; + + /* stack_depot_save() may allocate memory. */ + kmsan_enter_runtime(); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC); + kmsan_leave_runtime(); + + kmsan_internal_set_shadow_origin(address, size, -1, handle, + /*checked*/ true); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_poison_alloca); + +/* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_unpoison_memory(address, size, /*checked*/ true); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_unpoison_alloca); + +/* + * Report that an uninitialized value with the given origin was used in a way + * that constituted undefined behavior. + */ +void __msan_warning(u32 origin) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_report(origin, /*address*/ 0, /*size*/ 0, + /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0, + REASON_ANY); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_warning); + +/* + * At the beginning of an instrumented function, obtain the pointer to + * `struct kmsan_context_state` holding the metadata for function parameters. + */ +struct kmsan_context_state *__msan_get_context_state(void) +{ + return &kmsan_get_context()->cstate; +} +EXPORT_SYMBOL(__msan_get_context_state); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h new file mode 100644 index 000000000000..a14744205435 --- /dev/null +++ b/mm/kmsan/kmsan.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions used by the KMSAN runtime. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#ifndef __MM_KMSAN_KMSAN_H +#define __MM_KMSAN_KMSAN_H + +#include <asm/pgtable_64_types.h> +#include <linux/irqflags.h> +#include <linux/sched.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/nmi.h> +#include <linux/mm.h> +#include <linux/printk.h> + +#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100 +#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200 + +#define KMSAN_POISON_NOCHECK 0x0 +#define KMSAN_POISON_CHECK 0x1 +#define KMSAN_POISON_FREE 0x2 + +#define KMSAN_ORIGIN_SIZE 4 +#define KMSAN_MAX_ORIGIN_DEPTH 7 + +#define KMSAN_STACK_DEPTH 64 + +#define KMSAN_META_SHADOW (false) +#define KMSAN_META_ORIGIN (true) + +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + +/* + * A pair of metadata pointers to be returned by the instrumentation functions. + */ +struct shadow_origin_ptr { + void *shadow, *origin; +}; + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, + bool store); +void *kmsan_get_metadata(void *addr, bool is_origin); +void __init kmsan_init_alloc_meta_for_range(void *start, void *end); + +enum kmsan_bug_reason { + REASON_ANY, + REASON_COPY_TO_USER, + REASON_SUBMIT_URB, +}; + +void kmsan_print_origin(depot_stack_handle_t origin); + +/** + * kmsan_report() - Report a use of uninitialized value. + * @origin: Stack ID of the uninitialized value. + * @address: Address at which the memory access happens. + * @size: Memory access size. + * @off_first: Offset (from @address) of the first byte to be reported. + * @off_last: Offset (from @address) of the last byte to be reported. + * @user_addr: When non-NULL, denotes the userspace address to which the kernel + * is leaking data. + * @reason: Error type from enum kmsan_bug_reason. + * + * kmsan_report() prints an error message for a consequent group of bytes + * sharing the same origin. If an uninitialized value is used in a comparison, + * this function is called once without specifying the addresses. When checking + * a memory range, KMSAN may call kmsan_report() multiple times with the same + * @address, @size, @user_addr and @reason, but different @off_first and + * @off_last corresponding to different @origin values. + */ +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason); + +DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +static __always_inline struct kmsan_ctx *kmsan_get_context(void) +{ + return in_task() ? ¤t->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx); +} + +/* + * When a compiler hook or KMSAN runtime function is invoked, it may make a + * call to instrumented code and eventually call itself recursively. To avoid + * that, we guard the runtime entry regions with + * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if + * kmsan_in_runtime() is true. + * + * Non-runtime code may occasionally get executed in nested IRQs from the + * runtime code (e.g. when called via smp_call_function_single()). Because some + * KMSAN routines may take locks (e.g. for memory allocation), we conservatively + * bail out instead of calling them. To minimize the effect of this (potentially + * missing initialization events) kmsan_in_runtime() is not checked in + * non-blocking runtime functions. + */ +static __always_inline bool kmsan_in_runtime(void) +{ + if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) + return true; + if (in_nmi()) + return true; + return kmsan_get_context()->kmsan_in_runtime; +} + +static __always_inline void kmsan_enter_runtime(void) +{ + struct kmsan_ctx *ctx; + + ctx = kmsan_get_context(); + KMSAN_WARN_ON(ctx->kmsan_in_runtime++); +} + +static __always_inline void kmsan_leave_runtime(void) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + KMSAN_WARN_ON(--ctx->kmsan_in_runtime); +} + +depot_stack_handle_t kmsan_save_stack(void); +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra_bits); + +/* + * Pack and unpack the origin chain depth and UAF flag to/from the extra bits + * provided by the stack depot. + * The UAF flag is stored in the lowest bit, followed by the depth in the upper + * bits. + * set_dsh_extra_bits() is responsible for clamping the value. + */ +static __always_inline unsigned int kmsan_extra_bits(unsigned int depth, + bool uaf) +{ + return (depth << 1) | uaf; +} + +static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits) +{ + return extra_bits & 1; +} + +static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits) +{ + return extra_bits >> 1; +} + +/* + * kmsan_internal_ functions are supposed to be very simple and not require the + * kmsan_in_runtime() checks. + */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n); +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags); +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked); +void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, + u32 origin, bool checked); +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); + +void kmsan_internal_task_create(struct task_struct *task); + +bool kmsan_metadata_is_contiguous(void *addr, size_t size); +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason); + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order); + +/* + * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are + * non-instrumented versions of is_module_address() and is_vmalloc_addr() that + * are safe to call from KMSAN runtime without recursion. + */ +static inline bool kmsan_internal_is_module_addr(void *vaddr) +{ + return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END); +} + +static inline bool kmsan_internal_is_vmalloc_addr(void *addr) +{ + return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END); +} + +#endif /* __MM_KMSAN_KMSAN_H */ diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c new file mode 100644 index 000000000000..088e21a48dc4 --- /dev/null +++ b/mm/kmsan/kmsan_test.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KMSAN. + * For each test case checks the presence (or absence) of generated reports. + * Relies on 'console' tracepoint to capture reports as they appear in the + * kernel log. + * + * Copyright (C) 2021-2022, Google LLC. + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include <kunit/test.h> +#include "kmsan.h" + +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kmsan.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/tracepoint.h> +#include <linux/vmalloc.h> +#include <trace/events/printk.h> + +static DEFINE_PER_CPU(int, per_cpu_var); + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + bool available; + bool ignore; /* Stop console output collection. */ + char header[256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + + if (observed.ignore) + return; + spin_lock_irqsave(&observed.lock, flags); + + if (strnstr(buf, "BUG: KMSAN: ", len)) { + /* + * KMSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.header, buf, + min(len + 1, sizeof(observed.header))); + WRITE_ONCE(observed.available, true); + observed.ignore = true; + } + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.available); +} + +/* Information we expect in a report. */ +struct expect_report { + const char *error_type; /* Error type. */ + /* + * Kernel symbol from the error header, or NULL if no report is + * expected. + */ + const char *symbol; +}; + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + typeof(observed.header) expected_header; + unsigned long flags; + bool ret = false; + const char *end; + char *cur; + + /* Doubled-checked locking. */ + if (!report_available() || !r->symbol) + return (!report_available() && !r->symbol); + + /* Generate expected report contents. */ + + /* Title */ + cur = expected_header; + end = &expected_header[sizeof(expected_header) - 1]; + + cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type); + + scnprintf(cur, end - cur, " in %s", r->symbol); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expected_header, '+'); + if (cur) + *cur = '\0'; + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.header, expected_header); +out: + spin_unlock_irqrestore(&observed.lock, flags); + + return ret; +} + +/* ===== Test cases ===== */ + +/* Prevent replacing branch with select in LLVM. */ +static noinline void check_true(char *arg) +{ + pr_info("%s is true\n", arg); +} + +static noinline void check_false(char *arg) +{ + pr_info("%s is false\n", arg); +} + +#define USE(x) \ + do { \ + if (x) \ + check_true(#x); \ + else \ + check_false(#x); \ + } while (0) + +#define EXPECTATION_ETYPE_FN(e, reason, fn) \ + struct expect_report e = { \ + .error_type = reason, \ + .symbol = fn, \ + } + +#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL) +#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \ + EXPECTATION_ETYPE_FN(e, "uninit-value", fn) +#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__) +#define EXPECTATION_USE_AFTER_FREE(e) \ + EXPECTATION_ETYPE_FN(e, "use-after-free", __func__) + +/* Test case: ensure that kmalloc() returns uninitialized memory. */ +static void test_uninit_kmalloc(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + int *ptr; + + kunit_info(test, "uninitialized kmalloc test (UMR report)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that kmalloc'ed memory becomes initialized after memset(). + */ +static void test_init_kmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kmalloc test (no reports)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + memset(ptr, 0, sizeof(*ptr)); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that kzalloc() returns initialized memory. */ +static void test_init_kzalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kzalloc test (no reports)\n"); + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables are uninitialized by default. */ +static void test_uninit_stack_var(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int cond; + + kunit_info(test, "uninitialized stack variable (UMR report)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables with initializers are initialized. */ +static void test_init_stack_var(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + volatile int cond = 1; + + kunit_info(test, "initialized stack variable (no reports)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void two_param_fn_2(int arg1, int arg2) +{ + USE(arg1); + USE(arg2); +} + +static noinline void one_param_fn(int arg) +{ + two_param_fn_2(arg, arg); + USE(arg); +} + +static noinline void two_param_fn(int arg1, int arg2) +{ + int init = 0; + + one_param_fn(init); + USE(arg1); + USE(arg2); +} + +static void test_params(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to two_param_fn(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_params"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn"); +#endif + volatile int uninit, init = 1; + + kunit_info(test, + "uninit passed through a function parameter (UMR report)\n"); + two_param_fn(uninit, init); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static int signed_sum3(int a, int b, int c) +{ + return a + b + c; +} + +/* + * Test case: ensure that uninitialized values are tracked through function + * arguments. + */ +static void test_uninit_multiple_params(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile char b = 3, c; + volatile int a; + + kunit_info(test, "uninitialized local passed to fn (UMR report)\n"); + USE(signed_sum3(a, b, c)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Helper function to make an array uninitialized. */ +static noinline void do_uninit_local_array(char *array, int start, int stop) +{ + volatile char uninit; + + for (int i = start; i < stop; i++) + array[i] = uninit; +} + +/* + * Test case: ensure kmsan_check_memory() reports an error when checking + * uninitialized memory. + */ +static void test_uninit_kmsan_check_memory(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory"); + volatile char local_array[8]; + + kunit_info( + test, + "kmsan_check_memory() called on uninit local (UMR report)\n"); + do_uninit_local_array((char *)local_array, 5, 7); + + kmsan_check_memory((char *)local_array, 8); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: check that a virtual memory range created with vmap() from + * initialized pages is still considered as initialized. + */ +static void test_init_kmsan_vmap_vunmap(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + const int npages = 2; + struct page **pages; + void *vbuf; + + kunit_info(test, "pages initialized via vmap (no reports)\n"); + + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + for (int i = 0; i < npages; i++) + pages[i] = alloc_page(GFP_KERNEL); + vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + memset(vbuf, 0xfe, npages * PAGE_SIZE); + for (int i = 0; i < npages; i++) + kmsan_check_memory(page_address(pages[i]), PAGE_SIZE); + + if (vbuf) + vunmap(vbuf); + for (int i = 0; i < npages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memset() can initialize a buffer allocated via + * vmalloc(). + */ +static void test_init_vmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int npages = 8; + char *buf; + + kunit_info(test, "vmalloc buffer can be initialized (no reports)\n"); + buf = vmalloc(PAGE_SIZE * npages); + buf[0] = 1; + memset(buf, 0xfe, PAGE_SIZE * npages); + USE(buf[0]); + for (int i = 0; i < npages; i++) + kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE); + vfree(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that use-after-free reporting works. */ +static void test_uaf(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile int value; + volatile int *var; + + kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n"); + var = kmalloc(80, GFP_KERNEL); + var[3] = 0xfeedface; + kfree((int *)var); + /* Copy the invalid value before checking it. */ + value = var[3]; + USE(value); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that uninitialized values are propagated through per-CPU + * memory. + */ +static void test_percpu_propagate(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int uninit, check; + + kunit_info(test, + "uninit local stored to per_cpu memory (UMR report)\n"); + + this_cpu_write(per_cpu_var, uninit); + check = this_cpu_read(per_cpu_var); + USE(check); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that passing uninitialized values to printk() leads to an + * error report. + */ +static void test_printk(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to pr_info(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "number"); +#endif + volatile int uninit; + + kunit_info(test, "uninit local passed to pr_info() (UMR report)\n"); + pr_info("%px contains %d\n", &uninit, uninit); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and `dst`. + */ +static void test_memcpy_aligned_to_aligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned"); + volatile int uninit_src; + volatile int dst = 0; + + kunit_info( + test, + "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); + memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst, sizeof(dst)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the first of the two values. + */ +static void test_memcpy_aligned_to_unaligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)dst, 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the second of the two values. + */ +static void test_memcpy_aligned_to_unaligned2(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_memcpy_aligned_to_unaligned2"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void fibonacci(int *array, int size, int start) { + if (start < 2 || (start == size)) + return; + array[start] = array[start - 1] + array[start - 2]; + fibonacci(array, size, start + 1); +} + +static void test_long_origin_chain(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_long_origin_chain"); + /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */ + volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2]; + int last = ARRAY_SIZE(accum) - 1; + + kunit_info( + test, + "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n"); + /* + * We do not set accum[1] to 0, so the uninitializedness will be carried + * over to accum[2..last]. + */ + accum[0] = 1; + fibonacci((int *)accum, ARRAY_SIZE(accum), 2); + kmsan_check_memory((void *)&accum[last], sizeof(int)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static struct kunit_case kmsan_test_cases[] = { + KUNIT_CASE(test_uninit_kmalloc), + KUNIT_CASE(test_init_kmalloc), + KUNIT_CASE(test_init_kzalloc), + KUNIT_CASE(test_uninit_stack_var), + KUNIT_CASE(test_init_stack_var), + KUNIT_CASE(test_params), + KUNIT_CASE(test_uninit_multiple_params), + KUNIT_CASE(test_uninit_kmsan_check_memory), + KUNIT_CASE(test_init_kmsan_vmap_vunmap), + KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uaf), + KUNIT_CASE(test_percpu_propagate), + KUNIT_CASE(test_printk), + KUNIT_CASE(test_memcpy_aligned_to_aligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned2), + KUNIT_CASE(test_long_origin_chain), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + observed.header[0] = '\0'; + observed.ignore = false; + observed.available = false; + spin_unlock_irqrestore(&observed.lock, flags); + + return 0; +} + +static void test_exit(struct kunit *test) +{ +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kmsan_suite_init(struct kunit_suite *suite) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kmsan_suite_exit(struct kunit_suite *suite) +{ + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static struct kunit_suite kmsan_test_suite = { + .name = "kmsan", + .test_cases = kmsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kmsan_suite_init, + .suite_exit = kmsan_suite_exit, +}; +kunit_test_suites(&kmsan_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexander Potapenko <glider@google.com>"); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c new file mode 100644 index 000000000000..02736ec757f2 --- /dev/null +++ b/mm/kmsan/report.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN error reporting routines. + * + * Copyright (C) 2019-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include <linux/console.h> +#include <linux/moduleparam.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/uaccess.h> + +#include "kmsan.h" + +static DEFINE_RAW_SPINLOCK(kmsan_report_lock); +#define DESCR_SIZE 128 +/* Protected by kmsan_report_lock */ +static char report_local_descr[DESCR_SIZE]; +int panic_on_kmsan __read_mostly; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kmsan." +module_param_named(panic, panic_on_kmsan, int, 0); + +/* + * Skip internal KMSAN frames. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], + int num_entries) +{ + int len, skip; + char buf[64]; + + for (skip = 0; skip < num_entries; ++skip) { + len = scnprintf(buf, sizeof(buf), "%ps", + (void *)stack_entries[skip]); + + /* Never show __msan_* or kmsan_* functions. */ + if ((strnstr(buf, "__msan_", len) == buf) || + (strnstr(buf, "kmsan_", len) == buf)) + continue; + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; + } + + return skip; +} + +/* + * Currently the descriptions of locals generated by Clang look as follows: + * ----local_name@function_name + * We want to print only the name of the local, as other information in that + * description can be confusing. + * The meaningful part of the description is copied to a global buffer to avoid + * allocating memory. + */ +static char *pretty_descr(char *descr) +{ + int pos = 0, len = strlen(descr); + + for (int i = 0; i < len; i++) { + if (descr[i] == '@') + break; + if (descr[i] == '-') + continue; + report_local_descr[pos] = descr[i]; + if (pos + 1 == DESCR_SIZE) + break; + pos++; + } + report_local_descr[pos] = 0; + return report_local_descr; +} + +void kmsan_print_origin(depot_stack_handle_t origin) +{ + unsigned long *entries = NULL, *chained_entries = NULL; + unsigned int nr_entries, chained_nr_entries, skipnr; + void *pc1 = NULL, *pc2 = NULL; + depot_stack_handle_t head; + unsigned long magic; + char *descr = NULL; + unsigned int depth; + + if (!origin) + return; + + while (true) { + nr_entries = stack_depot_fetch(origin, &entries); + depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin)); + magic = nr_entries ? entries[0] : 0; + if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) { + descr = (char *)entries[1]; + pc1 = (void *)entries[2]; + pc2 = (void *)entries[3]; + pr_err("Local variable %s created at:\n", + pretty_descr(descr)); + if (pc1) + pr_err(" %pSb\n", pc1); + if (pc2) + pr_err(" %pSb\n", pc2); + break; + } + if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) { + /* + * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are + * not stored, so the output may be incomplete. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + pr_err("<Zero or more stacks not recorded to save memory>\n\n"); + head = entries[1]; + origin = entries[2]; + pr_err("Uninit was stored to memory at:\n"); + chained_nr_entries = + stack_depot_fetch(head, &chained_entries); + kmsan_internal_unpoison_memory( + chained_entries, + chained_nr_entries * sizeof(*chained_entries), + /*checked*/ false); + skipnr = get_stack_skipnr(chained_entries, + chained_nr_entries); + stack_trace_print(chained_entries + skipnr, + chained_nr_entries - skipnr, 0); + pr_err("\n"); + continue; + } + pr_err("Uninit was created at:\n"); + if (nr_entries) { + skipnr = get_stack_skipnr(entries, nr_entries); + stack_trace_print(entries + skipnr, nr_entries - skipnr, + 0); + } else { + pr_err("(stack is not available)\n"); + } + break; + } +} + +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason) +{ + unsigned long stack_entries[KMSAN_STACK_DEPTH]; + int num_stack_entries, skipnr; + char *bug_type = NULL; + unsigned long ua_flags; + bool is_uaf; + + if (!kmsan_enabled) + return; + if (!current->kmsan_ctx.allow_reporting) + return; + if (!origin) + return; + + current->kmsan_ctx.allow_reporting = false; + ua_flags = user_access_save(); + raw_spin_lock(&kmsan_report_lock); + pr_err("=====================================================\n"); + is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin)); + switch (reason) { + case REASON_ANY: + bug_type = is_uaf ? "use-after-free" : "uninit-value"; + break; + case REASON_COPY_TO_USER: + bug_type = is_uaf ? "kernel-infoleak-after-free" : + "kernel-infoleak"; + break; + case REASON_SUBMIT_URB: + bug_type = is_uaf ? "kernel-usb-infoleak-after-free" : + "kernel-usb-infoleak"; + break; + } + + num_stack_entries = + stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + + pr_err("BUG: KMSAN: %s in %pSb\n", bug_type, + (void *)stack_entries[skipnr]); + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + pr_err("\n"); + + kmsan_print_origin(origin); + + if (size) { + pr_err("\n"); + if (off_first == off_last) + pr_err("Byte %d of %d is uninitialized\n", off_first, + size); + else + pr_err("Bytes %d-%d of %d are uninitialized\n", + off_first, off_last, size); + } + if (address) + pr_err("Memory access of size %d starts at %px\n", size, + address); + if (user_addr && reason == REASON_COPY_TO_USER) + pr_err("Data copied to user address %px\n", user_addr); + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("=====================================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + raw_spin_unlock(&kmsan_report_lock); + if (panic_on_kmsan) + panic("kmsan.panic set ...\n"); + user_access_restore(ua_flags); + current->kmsan_ctx.allow_reporting = true; +} diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c new file mode 100644 index 000000000000..a787c04e9583 --- /dev/null +++ b/mm/kmsan/shadow.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN shadow implementation. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + * + */ + +#include <asm/kmsan.h> +#include <asm/tlbflush.h> +#include <linux/cacheflush.h> +#include <linux/memblock.h> +#include <linux/mm_types.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/stddef.h> + +#include "../internal.h" +#include "kmsan.h" + +#define shadow_page_for(page) ((page)->kmsan_shadow) + +#define origin_page_for(page) ((page)->kmsan_origin) + +static void *shadow_ptr_for(struct page *page) +{ + return page_address(shadow_page_for(page)); +} + +static void *origin_ptr_for(struct page *page) +{ + return page_address(origin_page_for(page)); +} + +static bool page_has_metadata(struct page *page) +{ + return shadow_page_for(page) && origin_page_for(page); +} + +static void set_no_shadow_origin_page(struct page *page) +{ + shadow_page_for(page) = NULL; + origin_page_for(page) = NULL; +} + +/* + * Dummy load and store pages to be used when the real metadata is unavailable. + * There are separate pages for loads and stores, so that every load returns a + * zero, and every store doesn't affect other loads. + */ +static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static unsigned long vmalloc_meta(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr, off; + + KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE)); + if (kmsan_internal_is_vmalloc_addr(addr)) { + off = addr64 - VMALLOC_START; + return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START : + KMSAN_VMALLOC_SHADOW_START); + } + if (kmsan_internal_is_module_addr(addr)) { + off = addr64 - MODULES_VADDR; + return off + (is_origin ? KMSAN_MODULES_ORIGIN_START : + KMSAN_MODULES_SHADOW_START); + } + return 0; +} + +static struct page *virt_to_page_or_null(void *vaddr) +{ + if (kmsan_virt_addr_valid(vaddr)) + return virt_to_page(vaddr); + else + return NULL; +} + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size, + bool store) +{ + struct shadow_origin_ptr ret; + void *shadow; + + /* + * Even if we redirect this memory access to the dummy page, it will + * go out of bounds. + */ + KMSAN_WARN_ON(size > PAGE_SIZE); + + if (!kmsan_enabled) + goto return_dummy; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size)); + shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW); + if (!shadow) + goto return_dummy; + + ret.shadow = shadow; + ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN); + return ret; + +return_dummy: + if (store) { + /* Ignore this store. */ + ret.shadow = dummy_store_page; + ret.origin = dummy_store_page; + } else { + /* This load will return zero. */ + ret.shadow = dummy_load_page; + ret.origin = dummy_load_page; + } + return ret; +} + +/* + * Obtain the shadow or origin pointer for the given address, or NULL if there's + * none. The caller must check the return value for being non-NULL if needed. + * The return value of this function should not depend on whether we're in the + * runtime or not. + */ +void *kmsan_get_metadata(void *address, bool is_origin) +{ + u64 addr = (u64)address, pad, off; + struct page *page; + void *ret; + + if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { + pad = addr % KMSAN_ORIGIN_SIZE; + addr -= pad; + } + address = (void *)addr; + if (kmsan_internal_is_vmalloc_addr(address) || + kmsan_internal_is_module_addr(address)) + return (void *)vmalloc_meta(address, is_origin); + + ret = arch_kmsan_get_meta_or_null(address, is_origin); + if (ret) + return ret; + + page = virt_to_page_or_null(address); + if (!page) + return NULL; + if (!page_has_metadata(page)) + return NULL; + off = addr % PAGE_SIZE; + + return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; +} + +void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + if (!dst || !page_has_metadata(dst)) + return; + if (!src || !page_has_metadata(src)) { + kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE, + /*checked*/ false); + return; + } + + kmsan_enter_runtime(); + __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE); + __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_copy_page_meta); + +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) +{ + bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled; + struct page *shadow, *origin; + depot_stack_handle_t handle; + int pages = 1 << order; + + if (!page) + return; + + shadow = shadow_page_for(page); + origin = origin_page_for(page); + + if (initialized) { + __memset(page_address(shadow), 0, PAGE_SIZE * pages); + __memset(page_address(origin), 0, PAGE_SIZE * pages); + return; + } + + /* Zero pages allocated by the runtime should also be initialized. */ + if (kmsan_in_runtime()) + return; + + __memset(page_address(shadow), -1, PAGE_SIZE * pages); + kmsan_enter_runtime(); + handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0); + kmsan_leave_runtime(); + /* + * Addresses are page-aligned, pages are contiguous, so it's ok + * to just fill the origin pages with @handle. + */ + for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++) + ((depot_stack_handle_t *)page_address(origin))[i] = handle; +} + +void kmsan_free_page(struct page *page, unsigned int order) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(page_address(page), + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + unsigned long shadow_start, origin_start, shadow_end, origin_end; + struct page **s_pages, **o_pages; + int nr, mapped; + + if (!kmsan_enabled) + return; + + shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); + shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); + if (!shadow_start) + return; + + nr = (end - start) / PAGE_SIZE; + s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); + o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + if (!s_pages || !o_pages) + goto ret; + for (int i = 0; i < nr; i++) { + s_pages[i] = shadow_page_for(pages[i]); + o_pages[i] = origin_page_for(pages[i]); + } + prot = __pgprot(pgprot_val(prot) | _PAGE_NX); + prot = PAGE_KERNEL; + + origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN); + origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN); + kmsan_enter_runtime(); + mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, + s_pages, page_shift); + KMSAN_WARN_ON(mapped); + mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, + o_pages, page_shift); + KMSAN_WARN_ON(mapped); + kmsan_leave_runtime(); + flush_tlb_kernel_range(shadow_start, shadow_end); + flush_tlb_kernel_range(origin_start, origin_end); + flush_cache_vmap(shadow_start, shadow_end); + flush_cache_vmap(origin_start, origin_end); + +ret: + kfree(s_pages); + kfree(o_pages); +} + +/* Allocate metadata for pages allocated at boot time. */ +void __init kmsan_init_alloc_meta_for_range(void *start, void *end) +{ + struct page *shadow_p, *origin_p; + void *shadow, *origin; + struct page *page; + u64 size; + + start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); + size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + shadow = memblock_alloc(size, PAGE_SIZE); + origin = memblock_alloc(size, PAGE_SIZE); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { + page = virt_to_page_or_null((char *)start + addr); + shadow_p = virt_to_page_or_null((char *)shadow + addr); + set_no_shadow_origin_page(shadow_p); + shadow_page_for(page) = shadow_p; + origin_p = virt_to_page_or_null((char *)origin + addr); + set_no_shadow_origin_page(origin_p); + origin_page_for(page) = origin_p; + } +} + +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order) +{ + for (int i = 0; i < (1 << order); i++) { + set_no_shadow_origin_page(&shadow[i]); + set_no_shadow_origin_page(&origin[i]); + shadow_page_for(&page[i]) = &shadow[i]; + origin_page_for(&page[i]) = &origin[i]; + } +} @@ -39,9 +39,11 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" +#include "mm_slot.h" #ifdef CONFIG_NUMA #define NUMA(x) (x) @@ -82,7 +84,7 @@ * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented - * using the same struct stable_node structure. + * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to @@ -112,17 +114,13 @@ */ /** - * struct mm_slot - ksm information per mm that is being scanned - * @link: link to the mm_slots hash list - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * struct ksm_mm_slot - ksm information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items - * @mm: the mm that this information is valid for */ -struct mm_slot { - struct hlist_node link; - struct list_head mm_list; - struct rmap_item *rmap_list; - struct mm_struct *mm; +struct ksm_mm_slot { + struct mm_slot slot; + struct ksm_rmap_item *rmap_list; }; /** @@ -135,14 +133,14 @@ struct mm_slot { * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; unsigned long address; - struct rmap_item **rmap_list; + struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** - * struct stable_node - node of the stable rbtree + * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain @@ -153,7 +151,7 @@ struct ksm_scan { * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ -struct stable_node { +struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ @@ -182,7 +180,7 @@ struct stable_node { }; /** - * struct rmap_item - reverse mapping item for virtual addresses + * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) @@ -193,8 +191,8 @@ struct stable_node { * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node */ -struct rmap_item { - struct rmap_item *rmap_list; +struct ksm_rmap_item { + struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA @@ -207,7 +205,7 @@ struct rmap_item { union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ - struct stable_node *head; + struct ksm_stable_node *head; struct hlist_node hlist; }; }; @@ -230,8 +228,8 @@ static LIST_HEAD(migrate_nodes); #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static struct mm_slot ksm_mm_head = { - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), +static struct ksm_mm_slot ksm_mm_head = { + .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -298,21 +296,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) static int __init ksm_slab_init(void) { - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; @@ -334,18 +332,18 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } -static __always_inline bool is_stable_node_chain(struct stable_node *chain) +static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } -static __always_inline bool is_stable_node_dup(struct stable_node *dup) +static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } -static inline void stable_node_chain_add_dup(struct stable_node *dup, - struct stable_node *chain) +static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, + struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; @@ -354,14 +352,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup, ksm_stable_node_dups++; } -static inline void __stable_node_dup_del(struct stable_node *dup) +static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } -static inline void stable_node_dup_del(struct stable_node *dup) +static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) @@ -373,9 +371,9 @@ static inline void stable_node_dup_del(struct stable_node *dup) #endif } -static inline struct rmap_item *alloc_rmap_item(void) +static inline struct ksm_rmap_item *alloc_rmap_item(void) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); @@ -384,14 +382,15 @@ static inline struct rmap_item *alloc_rmap_item(void) return rmap_item; } -static inline void free_rmap_item(struct rmap_item *rmap_item) +static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; + rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } -static inline struct stable_node *alloc_stable_node(void) +static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under @@ -401,43 +400,13 @@ static inline struct stable_node *alloc_stable_node(void) return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } -static inline void free_stable_node(struct stable_node *stable_node) +static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *slot; - - hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) - if (slot->mm == mm) - return slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -451,47 +420,74 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte; + int ret; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. + */ + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } + ret = page && PageKsm(page); + pte_unmap_unlock(pte, ptl); + return ret; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, +}; + /* - * We use break_ksm to break COW on a ksm page: it's a stripped down - * - * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) - * put_page(page); + * We use break_ksm to break COW on a ksm page by triggering unsharing, + * such that the ksm page will get replaced by an exclusive anonymous page. * - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * - * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; vm_fault_t ret = 0; do { + int ksm_page; + cond_resched(); - page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) - break; - if (PageKsm(page)) - ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, - NULL); - else - ret = VM_FAULT_WRITE; - put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + ksm_page = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); + if (WARN_ON_ONCE(ksm_page < 0)) + return ksm_page; + if (!ksm_page) + return 0; + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* - * We must loop because handle_mm_fault() may back out if there's - * any difficulty e.g. if pte accessed bit gets updated concurrently. - * - * VM_FAULT_WRITE is what we have been hoping for: it indicates that - * COW has been broken, even if the vma does not permit VM_WRITE; - * but note that a concurrent fault might break PageKsm for us. + * We must loop until we no longer find a KSM page because + * handle_mm_fault() may back out if there's any difficulty e.g. if + * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's @@ -528,7 +524,7 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, return vma; } -static void break_cow(struct rmap_item *rmap_item) +static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -547,7 +543,7 @@ static void break_cow(struct rmap_item *rmap_item) mmap_read_unlock(mm); } -static struct page *get_mergeable_page(struct rmap_item *rmap_item) +static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -560,12 +556,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) goto out; + if (is_zone_device_page(page)) + goto out_putpage; if (PageAnon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { +out_putpage: put_page(page); out: page = NULL; @@ -585,10 +584,10 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } -static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, +static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { - struct stable_node *chain = alloc_stable_node(); + struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); @@ -618,7 +617,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, return chain; } -static inline void free_stable_node_chain(struct stable_node *chain, +static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); @@ -626,9 +625,9 @@ static inline void free_stable_node_chain(struct stable_node *chain, ksm_stable_node_chains--; } -static void remove_node_from_stable_tree(struct stable_node *stable_node) +static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); @@ -690,7 +689,7 @@ enum get_ksm_page_flags { * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, +static struct page *get_ksm_page(struct ksm_stable_node *stable_node, enum get_ksm_page_flags flags) { struct page *page; @@ -769,10 +768,10 @@ stale: * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *page; stable_node = rmap_item->head; @@ -819,10 +818,10 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { - struct rmap_item *rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); @@ -859,18 +858,18 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } -static inline struct stable_node *folio_stable_node(struct folio *folio) +static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } -static inline struct stable_node *page_stable_node(struct page *page) +static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) + struct ksm_stable_node *stable_node) { VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); @@ -880,7 +879,7 @@ static inline void set_page_stable_node(struct page *page, /* * Only called through the sysfs control interface: */ -static int remove_stable_node(struct stable_node *stable_node) +static int remove_stable_node(struct ksm_stable_node *stable_node) { struct page *page; int err; @@ -918,10 +917,10 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } -static int remove_stable_node_chain(struct stable_node *stable_node, +static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -945,14 +944,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node, static int remove_all_stable_nodes(void) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, - struct stable_node, node); + struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; @@ -971,21 +970,25 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct mm_slot, mm_list); + slot = list_entry(ksm_mm_head.slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); - for (mm_slot = ksm_scan.mm_slot; - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { - mm = mm_slot->mm; + for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; + mm_slot = ksm_scan.mm_slot) { + VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); + + mm = mm_slot->slot.mm; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) @@ -1000,14 +1003,15 @@ static int unmerge_and_remove_all_rmap_items(void) mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct mm_slot, mm_list); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else @@ -1065,7 +1069,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, anon_exclusive = PageAnonExclusive(page); if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || anon_exclusive || mm_tlb_flush_pending(mm)) { pte_t entry; @@ -1095,6 +1098,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; } + /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; @@ -1102,11 +1106,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_dirty(entry)) set_page_dirty(page); + entry = pte_mkclean(entry); + + if (pte_write(entry)) + entry = pte_wrprotect(entry); - if (pte_protnone(entry)) - entry = pte_mkclean(pte_clear_savedwrite(entry)); - else - entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; @@ -1133,7 +1137,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; pmd_t *pmd; + pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; @@ -1148,6 +1154,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; + /* + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = *pmd; + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); @@ -1191,10 +1206,11 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + folio = page_folio(page); page_remove_rmap(page, vma, false); - if (!page_mapped(page)) - try_to_free_swap(page); - put_page(page); + if (!folio_mapped(folio)) + folio_free_swap(folio); + folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; @@ -1278,7 +1294,7 @@ out: * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, +static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; @@ -1315,9 +1331,9 @@ out: * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, +static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, - struct rmap_item *tree_rmap_item, + struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; @@ -1337,7 +1353,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, } static __always_inline -bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* @@ -1351,17 +1367,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) } static __always_inline -bool is_page_sharing_candidate(struct stable_node *stable_node) +bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct page *_tree_page, *tree_page = NULL; int nr = 0; @@ -1475,7 +1491,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, return tree_page; } -static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, +static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, struct rb_root *root) { if (!is_stable_node_chain(stable_node)) @@ -1502,12 +1518,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ -static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *stable_node = *_stable_node; + struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; @@ -1524,18 +1540,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct stable_node **s_n_d, - struct stable_node **s_n, +static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct stable_node **s_n_d, - struct stable_node *s_n, +static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, struct rb_root *root) { - struct stable_node *old_stable_node = s_n; + struct ksm_stable_node *old_stable_node = s_n; struct page *tree_page; tree_page = __stable_node_chain(s_n_d, &s_n, root, false); @@ -1559,8 +1575,8 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; - struct stable_node *page_node; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *page_node; page_node = page_stable_node(page); if (page_node && page_node->head != &migrate_nodes) { @@ -1580,7 +1596,7 @@ again: int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* @@ -1803,14 +1819,14 @@ chain_append: * This function returns the stable tree node just allocated on success, * NULL otherwise. */ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct page *kpage) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; kpfn = page_to_pfn(kpage); @@ -1825,7 +1841,7 @@ again: int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { @@ -1894,7 +1910,7 @@ again: rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { - struct stable_node *orig = stable_node; + struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { @@ -1923,7 +1939,7 @@ again: * the same walking algorithm in an rbtree. */ static -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, +struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { @@ -1937,12 +1953,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, new = &root->rb_node; while (*new) { - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); - tree_rmap_item = rb_entry(*new, struct rmap_item, node); + tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; @@ -1994,8 +2010,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ -static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node, +static void stable_tree_append(struct ksm_rmap_item *rmap_item, + struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* @@ -2037,12 +2053,12 @@ static void stable_tree_append(struct rmap_item *rmap_item, * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *kpage; unsigned int checksum; int err; @@ -2198,11 +2214,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) } } -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, - struct rmap_item **rmap_list, +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, unsigned long addr) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; @@ -2218,7 +2234,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ - rmap_item->mm = mm_slot->mm; + rmap_item->mm = mm_slot->slot.mm; + rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; @@ -2226,19 +2243,21 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, return rmap_item; } -static struct rmap_item *scan_get_next_rmap_item(struct page **page) +static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; + struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; + struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_list)) + if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; - slot = ksm_scan.mm_slot; - if (slot == &ksm_mm_head) { + mm_slot = ksm_scan.mm_slot; + if (mm_slot == &ksm_mm_head) { /* * A number of pages can hang around indefinitely on per-cpu * pagevecs, raised page count preventing write_protect_page @@ -2258,7 +2277,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct page *page; list_for_each_entry_safe(stable_node, next, @@ -2275,28 +2294,31 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); - ksm_scan.mm_slot = slot; + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ - if (slot == &ksm_mm_head) + if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } + slot = &mm_slot->slot; mm = slot->mm; + vma_iter_init(&vmi, mm, ksm_scan.address); + mmap_read_lock(mm); if (ksm_test_exit(mm)) - vma = NULL; - else - vma = find_vma(mm, ksm_scan.address); + goto no_vmas; - for (; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) @@ -2308,15 +2330,17 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) { + if (IS_ERR_OR_NULL(*page)) { ksm_scan.address += PAGE_SIZE; cond_resched(); continue; } + if (is_zone_device_page(*page)) + goto next_page; if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); - rmap_item = get_next_rmap_item(slot, + rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = @@ -2327,6 +2351,7 @@ next_mm: mmap_read_unlock(mm); return rmap_item; } +next_page: put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); @@ -2334,8 +2359,9 @@ next_mm: } if (ksm_test_exit(mm)) { +no_vmas: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: @@ -2344,8 +2370,9 @@ next_mm: remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct mm_slot, mm_list); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2356,11 +2383,11 @@ next_mm: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ - hash_del(&slot->link); - list_del(&slot->mm_list); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); @@ -2377,8 +2404,8 @@ next_mm: } /* Repeat until we've completed scanning the whole list */ - slot = ksm_scan.mm_slot; - if (slot != &ksm_mm_head) + mm_slot = ksm_scan.mm_slot; + if (mm_slot != &ksm_mm_head) goto next_mm; ksm_scan.seqnr++; @@ -2391,7 +2418,7 @@ next_mm: */ static void ksm_do_scan(unsigned int scan_npages) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { @@ -2406,7 +2433,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2495,18 +2522,21 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int needs_wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; + slot = &mm_slot->slot; + /* Check ksm_run too? Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_list); + needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle @@ -2518,9 +2548,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2534,7 +2564,8 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int easy_to_free = 0; /* @@ -2547,21 +2578,22 @@ void __ksm_exit(struct mm_struct *mm) */ spin_lock(&ksm_mmlist_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + hash_del(&slot->hash); + list_del(&slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_list, - &ksm_scan.mm_slot->mm_list); + list_move(&slot->mm_node, + &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { @@ -2612,8 +2644,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { - struct stable_node *stable_node; - struct rmap_item *rmap_item; + struct ksm_stable_node *stable_node; + struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); @@ -2683,7 +2715,7 @@ again: #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); @@ -2716,7 +2748,7 @@ static void wait_while_offlining(void) } } -static bool stable_node_dup_remove_range(struct stable_node *stable_node, +static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { @@ -2732,12 +2764,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node, return false; } -static bool stable_node_chain_remove_range(struct stable_node *stable_node, +static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -2761,14 +2793,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node, static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { - stable_node = rb_entry(node, struct stable_node, node); + stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + @@ -3206,7 +3238,7 @@ static int __init ksm_init(void) #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ - hotplug_memory_notifier(ksm_memory_callback, 100); + hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; diff --git a/mm/maccess.c b/mm/maccess.c index 5f4d240f67ec..074f6b086671 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -97,7 +97,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) return src - unsafe_addr; Efault: pagefault_enable(); - dst[-1] = '\0'; + dst[0] = '\0'; return -EFAULT; } diff --git a/mm/madvise.c b/mm/madvise.c index 5f0f0948a50e..a56a6d17e201 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -94,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - if (vma->vm_file) - return NULL; - return vma->anon_name; } @@ -182,7 +180,7 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; - if (!vma->vm_file) { + if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; @@ -225,6 +223,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, put_page(page); } swap_read_unplug(splug); + cond_resched(); return 0; } @@ -320,6 +319,21 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static inline bool can_do_file_pageout(struct vm_area_struct *vma) +{ + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -333,10 +347,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, spinlock_t *ptl; struct page *page = NULL; LIST_HEAD(page_list); + bool pageout_anon_only_filter; if (fatal_signal_pending(current)) return -EINTR; + pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && + !can_do_file_pageout(vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; @@ -363,6 +381,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (page_mapcount(page) != 1) goto huge_unlock; + if (pageout_anon_only_filter && !PageAnon(page)) + goto huge_unlock; + if (next - addr != HPAGE_PMD_SIZE) { int err; @@ -431,6 +452,8 @@ regular_page: if (PageTransCompound(page)) { if (page_mapcount(page) != 1) break; + if (pageout_anon_only_filter && !PageAnon(page)) + break; get_page(page); if (!trylock_page(page)) { put_page(page); @@ -451,8 +474,14 @@ regular_page: continue; } - /* Do not interfere with other mappings of this page */ - if (page_mapcount(page) != 1) + /* + * Do not interfere with other mappings of this page and + * non-LRU page. + */ + if (!PageLRU(page) || page_mapcount(page) != 1) + continue; + + if (pageout_anon_only_filter && !PageAnon(page)) continue; VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -549,23 +578,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } -static inline bool can_do_pageout(struct vm_area_struct *vma) -{ - if (vma_is_anonymous(vma)) - return true; - if (!vma->vm_file) - return false; - /* - * paging out pagecache only for non-anonymous mappings that correspond - * to the files the calling process could (if tried) open for writing; - * otherwise we'd be including shared non-exclusive mappings, which - * opens a side channel. - */ - return inode_owner_or_capable(&init_user_ns, - file_inode(vma->vm_file)) || - file_permission(vma->vm_file, MAY_WRITE) == 0; -} - static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) @@ -577,7 +589,14 @@ static long madvise_pageout(struct vm_area_struct *vma, if (!can_madv_lru_vma(vma)) return -EINVAL; - if (!can_do_pageout(vma)) + /* + * If the VMA belongs to a private file mapping, there can be private + * dirty pages which can be paged out if even this process is neither + * owner nor write capable of the file. We allow private file mappings + * further to pageout dirty anon pages. + */ + if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && + (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); @@ -597,6 +616,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; + struct folio *folio; struct page *page; int nr_swap = 0; unsigned long next; @@ -641,56 +661,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, page = vm_normal_page(vma, addr, ptent); if (!page || is_zone_device_page(page)) continue; + folio = page_folio(page); /* - * If pmd isn't transhuge but the page is THP and + * If pmd isn't transhuge but the folio is large and * is owned by only this process, split it and * deactivate all pages. */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) + if (folio_test_large(folio)) { + if (folio_mapcount(folio) != 1) goto out; - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); goto out; } pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); goto out; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; continue; } - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (PageSwapCache(page) || PageDirty(page)) { - if (!trylock_page(page)) + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { + if (!folio_trylock(folio)) continue; /* - * If page is shared with others, we couldn't clear - * PG_dirty of the page. + * If folio is shared with others, we mustn't clear + * the folio's dirty flag. */ - if (page_mapcount(page) != 1) { - unlock_page(page); + if (folio_mapcount(folio) != 1) { + folio_unlock(folio); continue; } - if (PageSwapCache(page) && !try_to_free_swap(page)) { - unlock_page(page); + if (folio_test_swapcache(folio) && + !folio_free_swap(folio)) { + folio_unlock(folio); continue; } - ClearPageDirty(page); - unlock_page(page); + folio_clear_dirty(folio); + folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { @@ -708,7 +728,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } - mark_page_lazyfree(page); + mark_page_lazyfree(&folio->page); } out: if (nr_swap) { @@ -767,8 +787,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for shrink_active_list to actually free - * these pages later if no one else has touched them in the meantime, + * zap_page_range_single call sets things up for shrink_active_list to actually + * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * @@ -785,7 +805,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range(vma, start, end - start); + zap_page_range_single(vma, start, end - start, NULL); return 0; } @@ -808,7 +828,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -823,6 +850,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ @@ -1057,6 +1087,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; + case MADV_COLLAPSE: + return madvise_collapse(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1150,6 +1182,7 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: + case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: @@ -1166,13 +1199,13 @@ madvise_behavior_valid(int behavior) } } -static bool -process_madvise_behavior_valid(int behavior) +static bool process_madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: + case MADV_COLLAPSE: return true; default: return false; @@ -1238,7 +1271,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, if (start >= end) break; if (prev) - vma = prev->vm_next; + vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } @@ -1255,7 +1288,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, int error; /* Only anonymous mappings can be named */ - if (vma->vm_file) + if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, @@ -1339,6 +1372,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. + * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. @@ -1440,7 +1474,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto out; } - ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 1b0ab8fcfd8b..175e424b9ab1 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); if (!pmd_trans_unstable(&pmdval)) return 0; diff --git a/mm/memblock.c b/mm/memblock.c index b5d3026979fc..511d4783dcf1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2000,7 +2000,7 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - start = round_down(start, pageblock_nr_pages); + start = pageblock_start_pfn(start); /* * If we had a previous bank, and there is a space @@ -2014,12 +2014,12 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); } #ifdef CONFIG_SPARSEMEM if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..ab457f0394ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> +#include <linux/parser.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -88,13 +89,6 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; -/* Whether the swap controller is active */ -#ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __ro_after_init; -#else -#define cgroup_memory_noswap 1 -#endif - #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -102,7 +96,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -597,25 +591,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#else - VM_BUG_ON(!irqs_disabled()); -#endif + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); } static void __memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#endif + preempt_disable_nested(); } static void memcg_stats_unlock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_enable(); -#endif + preempt_enable_nested(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) @@ -669,6 +656,83 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_MEMCG_EVENTS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_MEMCG_EVENTS]; +}; + +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -715,7 +779,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (IS_ENABLED(CONFIG_DEBUG_VM)) { switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: @@ -725,7 +789,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, WARN_ON_ONCE(!in_task()); break; default: - WARN_ON_ONCE(!irqs_disabled()); + VM_WARN_ON_IRQS_ENABLED(); } } @@ -816,27 +880,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - if (mem_cgroup_disabled()) + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats.events[event]); + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { long x = 0; int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); return x; } @@ -1143,12 +1217,12 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } while ((memcg = parent_mem_cgroup(memcg))); /* - * When cgruop1 non-hierarchy mode is used, + * When cgroup1 non-hierarchy mode is used, * parent_mem_cgroup() does not walk all the way up to the * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. */ - if (last != root_mem_cgroup) + if (!mem_cgroup_is_root(last)) __invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg); } @@ -1172,7 +1246,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); + BUG_ON(mem_cgroup_is_root(memcg)); for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1201,7 +1275,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) memcg = folio_memcg(folio); if (!memcg) - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); + VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); else VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); } @@ -1401,6 +1475,7 @@ static const struct memory_stat memory_stats[] = { { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, + { "sec_pagetables", NR_SECONDARY_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, { "vmalloc", MEMCG_VMALLOC }, @@ -1467,29 +1542,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, -#endif -}; - static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; @@ -1525,15 +1577,22 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) /* Accumulated memory events */ seq_buf_printf(&s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); + memcg_events(memcg, PGSCAN_DIRECT) + + memcg_events(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(&s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); + memcg_events(memcg, PGSTEAL_DIRECT) + + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); + + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { + if (memcg_vm_event_stat[i] == PGPGIN || + memcg_vm_event_stat[i] == PGPGOUT) + continue; - for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) seq_buf_printf(&s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); + } /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); @@ -1607,17 +1666,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max = READ_ONCE(memcg->memory.max); - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - if (mem_cgroup_swappiness(memcg)) - max += min(READ_ONCE(memcg->swap.max), - (unsigned long)total_swap_pages); - } else { /* v1 */ + if (do_memsw_account()) { if (mem_cgroup_swappiness(memcg)) { /* Calculate swap excess capacity from memsw limit */ unsigned long swap = READ_ONCE(memcg->memsw.max) - max; max += min(swap, (unsigned long)total_swap_pages); } + } else { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); } return max; } @@ -1982,7 +2041,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, rcu_read_lock(); memcg = mem_cgroup_from_task(victim); - if (memcg == root_mem_cgroup) + if (mem_cgroup_is_root(memcg)) goto out; /* @@ -2334,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2625,7 +2685,8 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, + NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -2789,6 +2850,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() */ folio->memcg_data = (unsigned long)memcg; } @@ -2940,7 +3002,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { struct obj_cgroup *objcg = NULL; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { objcg = rcu_dereference(memcg->objcg); if (objcg && obj_cgroup_tryget(objcg)) break; @@ -2971,7 +3033,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || memcg_kmem_bypass()) + if (!memcg_kmem_enabled()) return NULL; if (PageMemcgKmem(page)) { @@ -3362,7 +3424,7 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -3444,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, + NULL)) { ret = -EBUSY; break; } @@ -3555,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_retries--; } @@ -3975,6 +4039,8 @@ static const unsigned int memcg1_stats[] = { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, MEMCG_SWAP, }; @@ -3988,6 +4054,8 @@ static const char *const memcg1_stat_names[] = { "mapped_file", "dirty", "writeback", + "workingset_refault_anon", + "workingset_refault_file", "swap", }; @@ -4016,7 +4084,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4047,7 +4116,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * PAGE_SIZE); + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4772,6 +4841,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, unsigned int efd, cfd; struct fd efile; struct fd cfile; + struct dentry *cdentry; const char *name; char *endp; int ret; @@ -4826,6 +4896,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, goto out_put_cfile; /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing @@ -4833,7 +4913,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_path.dentry->d_name.name; + name = cdentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -4857,7 +4937,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) @@ -5110,8 +5190,8 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) struct mem_cgroup *memcg; cgrp = cgroup_get_from_id(ino); - if (!cgrp) - return ERR_PTR(-ENOENT); + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); if (css) @@ -5164,12 +5244,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); } static void mem_cgroup_free(struct mem_cgroup *memcg) { + lru_gen_exit_memcg(memcg); memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -5192,6 +5274,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5228,6 +5314,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); return memcg; fail: mem_cgroup_id_remove(memcg); @@ -5262,6 +5349,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); @@ -5410,9 +5498,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. */ - delta = memcg->vmstats.state_pending[i]; + delta = memcg->vmstats->state_pending[i]; if (delta) - memcg->vmstats.state_pending[i] = 0; + memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ v = READ_ONCE(statc->state[i]); @@ -5425,15 +5513,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) continue; /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats.state[i] += delta; + memcg->vmstats->state[i] += delta; if (parent) - parent->vmstats.state_pending[i] += delta; + parent->vmstats->state_pending[i] += delta; } - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - delta = memcg->vmstats.events_pending[i]; + for (i = 0; i < NR_MEMCG_EVENTS; i++) { + delta = memcg->vmstats->events_pending[i]; if (delta) - memcg->vmstats.events_pending[i] = 0; + memcg->vmstats->events_pending[i] = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { @@ -5444,9 +5532,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (!delta) continue; - memcg->vmstats.events[i] += delta; + memcg->vmstats->events[i] += delta; if (parent) - parent->vmstats.events_pending[i] += delta; + parent->vmstats->events_pending[i] += delta; } for_each_node_state(nid, N_MEMORY) { @@ -5561,7 +5649,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Because lookup_swap_cache() updates some statistics counter, + * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); @@ -5580,15 +5668,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { + unsigned long index; + struct folio *folio; + if (!vma->vm_file) /* anonymous vma */ return NULL; if (!(mc.flags & MOVE_FILE)) return NULL; - /* page is moved even if it's not RSS of this task(page-faulted). */ + /* folio is moved even if it's not RSS of this task(page-faulted). */ /* shmem/tmpfs may report page out on swap: account for that too. */ - return find_get_incore_page(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); + index = linear_page_index(vma, addr); + folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); + if (!folio) + return NULL; + return folio_file_page(folio, index); } /** @@ -5673,6 +5767,12 @@ static int mem_cgroup_move_account(struct page *page, } } +#ifdef CONFIG_SWAP + if (folio_test_swapcache(folio)) { + __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); + __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); + } +#endif if (folio_test_writeback(folio)) { __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); @@ -5871,7 +5971,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6169,9 +6269,7 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } @@ -6196,6 +6294,30 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + /* find the first leader if there is any */ + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && READ_ONCE(task->mm->owner) == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6307,7 +6429,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL); if (!reclaimed && !nr_retries--) break; @@ -6356,7 +6479,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_reclaims--; continue; } @@ -6479,21 +6603,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_NODES = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t if_tokens = { + { MEMORY_RECLAIM_NODES, "nodes=%s" }, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options; - int err; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + int token; + char value[256]; + nodemask_t nodemask = NODE_MASK_ALL; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + token = match_token(start, if_tokens, args); + match_strlcpy(value, args, sizeof(value)); + switch (token) { + case MEMORY_RECLAIM_NODES: + if (nodelist_parse(value, nodemask) < 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6510,7 +6667,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options); + GFP_KERNEL, reclaim_options, + &nodemask); if (!reclaimed && !nr_retries--) return -EAGAIN; @@ -6601,6 +6759,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, @@ -6813,21 +6972,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin - * @page: page to charge + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the page is allocated + * @entry: swap entry for which the folio is allocated * - * This function charges a page allocated for swapin. Please call this before - * adding the page to the swapcache. + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { - struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; @@ -7073,7 +7231,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (memcg == root_mem_cgroup) + if (mem_cgroup_is_root(memcg)) goto out; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; @@ -7200,7 +7358,7 @@ static int __init mem_cgroup_init(void) } subsys_initcall(mem_cgroup_init); -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { @@ -7208,7 +7366,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. */ - if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { VM_BUG_ON(1); break; } @@ -7238,7 +7396,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (mem_cgroup_disabled()) return; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!do_memsw_account()) return; memcg = folio_memcg(folio); @@ -7267,7 +7425,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (!cgroup_memory_noswap && memcg != swap_memcg) { + if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7303,7 +7461,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return 0; memcg = folio_memcg(folio); @@ -7319,7 +7477,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && + if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7347,15 +7505,18 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, nr_pages); - else + if (!mem_cgroup_is_root(memcg)) { + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + else + page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); @@ -7367,31 +7528,31 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } -bool mem_cgroup_swap_full(struct page *page) +bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return false; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || @@ -7404,10 +7565,9 @@ bool mem_cgroup_swap_full(struct page *page) static int __init setup_swap_account(char *s) { - if (!strcmp(s, "1")) - cgroup_memory_noswap = false; - else if (!strcmp(s, "0")) - cgroup_memory_noswap = true; + pr_warn_once("The swapaccount= commandline option is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); @@ -7556,7 +7716,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) return true; original_memcg = get_mem_cgroup_from_objcg(objcg); - for (memcg = original_memcg; memcg != root_mem_cgroup; + for (memcg = original_memcg; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long max = READ_ONCE(memcg->zswap_max); unsigned long pages; @@ -7676,20 +7836,9 @@ static struct cftype zswap_files[] = { }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -/* - * If mem_cgroup_swap_init() is implemented as a subsys_initcall() - * instead of a core_initcall(), this could mean cgroup_memory_noswap still - * remains set to false even when memcg is disabled via "cgroup_disable=memory" - * boot parameter. This may result in premature OOPS inside - * mem_cgroup_get_nr_swap_pages() function in corner cases. - */ static int __init mem_cgroup_swap_init(void) { - /* No memory control -> no swap control */ if (mem_cgroup_disabled()) - cgroup_memory_noswap = true; - - if (cgroup_memory_noswap) return 0; WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); @@ -7699,6 +7848,6 @@ static int __init mem_cgroup_swap_init(void) #endif return 0; } -core_initcall(mem_cgroup_swap_init); +subsys_initcall(mem_cgroup_swap_init); -#endif /* CONFIG_MEMCG_SWAP */ +#endif /* CONFIG_SWAP */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 14439806b5ef..c77a9e37e27e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,6 +74,19 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +inline void num_poisoned_pages_inc(unsigned long pfn) +{ + atomic_long_inc(&num_poisoned_pages); + memblk_nr_poison_inc(pfn); +} + +inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ + atomic_long_sub(i, &num_poisoned_pages); + if (pfn != -1UL) + memblk_nr_poison_sub(pfn, i); +} + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -115,7 +128,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (release) put_page(page); page_ref_inc(page); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); return true; } @@ -277,7 +290,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * to SIG_IGN, but hopefully no one will do that? */ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, - addr_lsb, t); /* synchronous? */ + addr_lsb, t); if (ret < 0) pr_info("Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); @@ -345,13 +358,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ +#define FSDAX_INVALID_PGOFF ULONG_MAX + /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. * - * Notice: @fsdax_pgoff is used only when @p is a fsdax page. - * In other cases, such as anonymous and file-backend page, the address to be - * killed can be caculated by @p itself. + * Note: @fsdax_pgoff is used only when @p is a fsdax page and a + * filesystem with a memory failure handler has claimed the + * memory_failure event. In all other cases, page->index and + * page->mapping are sufficient for mapping the page back to its + * corresponding user virtual address. */ static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t fsdax_pgoff, struct vm_area_struct *vma, @@ -367,11 +384,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, tk->addr = page_address_in_vma(p, vma); if (is_zone_device_page(p)) { - /* - * Since page->mapping is not used for fsdax, we need - * calculate the address based on the vma. - */ - if (p->pgmap->type == MEMORY_DEVICE_FS_DAX) + if (fsdax_pgoff != FSDAX_INVALID_PGOFF) tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); } else @@ -413,7 +426,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, { struct to_kill *tk, *next; - list_for_each_entry_safe (tk, next, to_kill, nd) { + list_for_each_entry_safe(tk, next, to_kill, nd) { if (forcekill) { /* * In case something went wrong with munmapping @@ -437,6 +450,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } + list_del(&tk->nd); put_task_struct(tk->tsk); kfree(tk); } @@ -520,14 +534,15 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; + if (vma->vm_mm != t->mm) + continue; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == t->mm) - add_to_kill(t, page, 0, vma, to_kill); + add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, to_kill); } } read_unlock(&tasklist_lock); - page_unlock_anon_vma_read(av); + anon_vma_unlock_read(av); } /* @@ -559,7 +574,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. */ if (vma->vm_mm == t->mm) - add_to_kill(t, page, 0, vma, to_kill); + add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, + to_kill); } } read_unlock(&tasklist_lock); @@ -632,7 +648,7 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, swp_entry_t swp = pte_to_swp_entry(pte); if (is_hwpoison_entry(swp)) - pfn = hwpoison_entry_to_pfn(swp); + pfn = swp_offset_pfn(swp); } if (!pfn || pfn != poisoned_pfn) @@ -743,6 +759,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, }; priv.tk.tsk = p; + if (!p->mm) + return -EFAULT; + mmap_read_lock(p->mm); ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, (void *)&priv); @@ -821,12 +840,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn, int ret = MF_FAILED; if (mapping->a_ops->error_remove_page) { + struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { + } else if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_NOIO)) { pr_info("%#lx: failed to release buffers\n", pfn); } else { ret = MF_RECOVERED; @@ -1074,6 +1094,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) int res; struct page *hpage = compound_head(p); struct address_space *mapping; + bool extra_pins = false; if (!PageHuge(hpage)) return MF_DELAYED; @@ -1081,6 +1102,8 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, page_to_pfn(p), mapping); + /* The page is kept in page cache. */ + extra_pins = true; unlock_page(hpage); } else { unlock_page(hpage); @@ -1098,7 +1121,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) } } - if (has_extra_refcount(ps, p, false)) + if (has_extra_refcount(ps, p, extra_pins)) res = MF_FAILED; return res; @@ -1173,14 +1196,16 @@ static struct page_state error_states[] = { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ -static void action_result(unsigned long pfn, enum mf_action_page_type type, - enum mf_result result) +static int action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(pfn); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); + + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } static int page_action(struct page_state *ps, struct page *p, @@ -1191,14 +1216,12 @@ static int page_action(struct page_state *ps, struct page *p, /* page p should be unlocked after returning from ps->action(). */ result = ps->action(ps, p); - action_result(pfn, ps->type, result); - /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. */ - return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; + return action_result(pfn, ps->type, result); } static inline bool PageHWPoisonTakenOff(struct page *page) @@ -1238,14 +1261,14 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, false); if (hugetlb) return ret; /* - * This check prevents from calling get_hwpoison_unless_zero() - * for any unsupported type of page in order to reduce the risk of - * unexpected races caused by taking a page refcount. + * This check prevents from calling get_page_unless_zero() for any + * unsupported type of page in order to reduce the risk of unexpected + * races caused by taking a page refcount. */ if (!HWPoisonHandlable(head, flags)) return -EBUSY; @@ -1328,7 +1351,7 @@ static int __get_unpoison_page(struct page *page) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, true); if (hugetlb) return ret; @@ -1396,14 +1419,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; - int kill = 1, forcekill; + int forcekill; bool mlocked = PageMlocked(hpage); /* * Here we are interested only in user-mapped pages, so skip any * other types of pages. */ - if (PageReserved(p) || PageSlab(p)) + if (PageReserved(p) || PageSlab(p) || PageTable(p)) return true; if (!(PageLRU(hpage) || PageHuge(p))) return true; @@ -1437,7 +1460,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { - kill = 0; ttu |= TTU_IGNORE_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); @@ -1448,12 +1470,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. - * - * Error handling: We ignore errors here because - * there's nothing that can be done. */ - if (kill) - collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); if (PageHuge(hpage) && !PageAnon(hpage)) { /* @@ -1495,7 +1513,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); return unmap_success; @@ -1524,20 +1543,18 @@ static int identify_page_state(unsigned long pfn, struct page *p, return page_action(ps, p, pfn); } -static int try_to_split_thp_page(struct page *page, const char *msg) +static int try_to_split_thp_page(struct page *page) { + int ret; + lock_page(page); - if (unlikely(split_huge_page(page))) { - unsigned long pfn = page_to_pfn(page); + ret = split_huge_page(page); + unlock_page(page); - unlock_page(page); - pr_info("%s: %#lx: thp split failed\n", msg, pfn); + if (unlikely(ret)) put_page(page); - return -EBUSY; - } - unlock_page(page); - return 0; + return ret; } static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, @@ -1671,8 +1688,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #ifdef CONFIG_HUGETLB_PAGE /* * Struct raw_hwp_page represents information about "raw error page", - * constructing singly linked list originated from ->private field of - * SUBPAGE_INDEX_HWPOISON-th tail page. + * constructing singly linked list from ->_hugetlb_hwpoison field of folio. */ struct raw_hwp_page { struct llist_node node; @@ -1681,7 +1697,7 @@ struct raw_hwp_page { static inline struct llist_head *raw_hwp_list_head(struct page *hpage) { - return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); + return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; } static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) @@ -1696,6 +1712,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) if (move_flag) SetPageHWPoison(p->page); + else + num_poisoned_pages_sub(page_to_pfn(p->page), 1); kfree(p); count++; } @@ -1731,7 +1749,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) llist_add(&raw_hwp->node, head); /* the first error event will be counted in action_result(). */ if (ret) - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. We no longer trace all @@ -1785,7 +1803,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage) * -EBUSY - the hugepage is busy (try to retry) * -EHWPOISON - the hugepage is already hwpoisoned */ -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); struct page *head = compound_head(page); @@ -1815,6 +1834,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) goto out; } + /* + * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * from being migrated by memory hotremove. + */ + if (count_increased && HPageMigratable(head)) { + ClearHPageMigratable(head); + *migratable_cleared = true; + } + return ret; out: if (count_increased) @@ -1834,10 +1862,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb struct page *p = pfn_to_page(pfn); struct page *head; unsigned long page_flags; + bool migratable_cleared = false; *hugetlb = 1; retry: - res = get_huge_page_for_hwpoison(pfn, flags); + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); if (res == 2) { /* fallback to normal page handling */ *hugetlb = 0; return 0; @@ -1853,8 +1882,7 @@ retry: flags |= MF_NO_RETRY; goto retry; } - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return res; + return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); } head = compound_head(p); @@ -1862,8 +1890,12 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); - res = -EOPNOTSUPP; - goto out; + if (migratable_cleared) + SetHPageMigratable(head); + unlock_page(head); + if (res == 1) + put_page(head); + return -EOPNOTSUPP; } /* @@ -1878,22 +1910,17 @@ retry: } else { res = MF_FAILED; } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; + return action_result(pfn, MF_MSG_FREE_HUGE, res); } page_flags = head->flags; if (!hwpoison_user_mappings(p, pfn, flags, head)) { - action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); - res = -EBUSY; - goto out; + unlock_page(head); + return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } return identify_page_state(pfn, p, page_flags); -out: - unlock_page(head); - return res; } #else @@ -1908,17 +1935,25 @@ static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) } #endif /* CONFIG_HUGETLB_PAGE */ +/* Drop the extra refcount in case we come from madvise() */ +static void put_ref_page(unsigned long pfn, int flags) +{ + struct page *page; + + if (!(flags & MF_COUNT_INCREASED)) + return; + + page = pfn_to_page(pfn); + if (page) + put_page(page); +} + static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { - struct page *page = pfn_to_page(pfn); int rc = -ENXIO; - if (flags & MF_COUNT_INCREASED) - /* - * Drop the extra refcount in case we come from madvise(). - */ - put_page(page); + put_ref_page(pfn, flags); /* device metadata space is not recoverable */ if (!pgmap_pfn_valid(pgmap, pfn)) @@ -1928,7 +1963,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * Call driver's implementation to handle the memory failure, otherwise * fall back to generic handler. */ - if (pgmap->ops->memory_failure) { + if (pgmap_has_memory_failure(pgmap)) { rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); /* * Fall back to generic handler too if operation is not @@ -2026,7 +2061,7 @@ try_again: /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: - * prep_new_page() will be the gate keeper. + * check_new_page() will be the gate keeper. * 2) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been @@ -2050,16 +2085,13 @@ try_again: } res = MF_FAILED; } - action_result(pfn, MF_MSG_BUDDY, res); - res = res == MF_RECOVERED ? 0 : -EBUSY; + res = action_result(pfn, MF_MSG_BUDDY, res); } else { - action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); } goto unlock_mutex; } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); goto unlock_mutex; } } @@ -2079,9 +2111,8 @@ try_again: * page is a valid handlable page. */ SetPageHasHWPoisoned(hpage); - if (try_to_split_thp_page(p, "Memory Failure") < 0) { - action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); - res = -EBUSY; + if (try_to_split_thp_page(p) < 0) { + res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); @@ -2114,8 +2145,7 @@ try_again: retry = false; goto try_again; } - action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); goto unlock_page; } @@ -2129,7 +2159,7 @@ try_again: page_flags = p->flags; if (hwpoison_filter(p)) { - TestClearPageHWPoison(p); + ClearPageHWPoison(p); unlock_page(p); put_page(p); res = -EOPNOTSUPP; @@ -2155,8 +2185,7 @@ try_again: * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ if (!hwpoison_user_mappings(p, pfn, flags, p)) { - action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } @@ -2164,8 +2193,7 @@ try_again: * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2310,8 +2338,8 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int ret = -EBUSY; - int freeit = 0; unsigned long count = 1; + bool huge = false; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2354,12 +2382,13 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (PageSlab(page) || PageTable(page)) + if (PageSlab(page) || PageTable(page) || PageReserved(page)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2375,16 +2404,17 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } else { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; + put_page(page); goto unlock_mutex; } } - freeit = !!TestClearPageHWPoison(p); put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + if (TestClearPageHWPoison(p)) { put_page(page); ret = 0; } @@ -2392,8 +2422,9 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); - if (!ret || freeit) { - num_poisoned_pages_sub(count); + if (!ret) { + if (!huge) + num_poisoned_pages_sub(pfn, 1); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } @@ -2404,24 +2435,26 @@ EXPORT_SYMBOL(unpoison_memory); static bool isolate_page(struct page *page, struct list_head *pagelist) { bool isolated = false; - bool lru = PageLRU(page); if (PageHuge(page)) { isolated = !isolate_hugetlb(page, pagelist); } else { + bool lru = !__PageMovable(page); + if (lru) isolated = !isolate_lru_page(page); else - isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE); + isolated = !isolate_movable_page(page, + ISOLATE_UNEVICTABLE); - if (isolated) + if (isolated) { list_add(&page->lru, pagelist); + if (lru) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_lru(page)); + } } - if (isolated && lru) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); - /* * If we succeed to isolate the page, we grabbed another refcount on * the page, so we can safely drop the one we got from get_any_pages(). @@ -2434,11 +2467,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) } /* - * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. + * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. * If the page is mapped, it migrates the contents over. */ -static int __soft_offline_page(struct page *page) +static int soft_offline_in_use_page(struct page *page) { long ret = 0; unsigned long pfn = page_to_pfn(page); @@ -2451,6 +2484,14 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + if (!huge && PageTransHuge(hpage)) { + if (try_to_split_thp_page(page)) { + pr_info("soft offline: %#lx: thp split failed\n", pfn); + return -EBUSY; + } + hpage = page; + } + lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); @@ -2500,32 +2541,6 @@ static int __soft_offline_page(struct page *page) return ret; } -static int soft_offline_in_use_page(struct page *page) -{ - struct page *hpage = compound_head(page); - - if (!PageHuge(page) && PageTransHuge(hpage)) - if (try_to_split_thp_page(page, "soft offline") < 0) - return -EBUSY; - return __soft_offline_page(page); -} - -static int soft_offline_free_page(struct page *page) -{ - int rc = 0; - - if (!page_handle_poison(page, true, false)) - rc = -EBUSY; - - return rc; -} - -static void put_ref_page(struct page *page) -{ - if (page) - put_page(page); -} - /** * soft_offline_page - Soft offline a page. * @pfn: pfn to soft-offline @@ -2554,19 +2569,17 @@ int soft_offline_page(unsigned long pfn, int flags) { int ret; bool try_again = true; - struct page *page, *ref_page = NULL; - - WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); + struct page *page; - if (!pfn_valid(pfn)) + if (!pfn_valid(pfn)) { + WARN_ON_ONCE(flags & MF_COUNT_INCREASED); return -ENXIO; - if (flags & MF_COUNT_INCREASED) - ref_page = pfn_to_page(pfn); + } /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ page = pfn_to_online_page(pfn); if (!page) { - put_ref_page(ref_page); + put_ref_page(pfn, flags); return -EIO; } @@ -2574,7 +2587,7 @@ int soft_offline_page(unsigned long pfn, int flags) if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); - put_ref_page(ref_page); + put_ref_page(pfn, flags); mutex_unlock(&mf_mutex); return 0; } @@ -2587,8 +2600,6 @@ retry: if (hwpoison_filter(page)) { if (ret > 0) put_page(page); - else - put_ref_page(ref_page); mutex_unlock(&mf_mutex); return -EOPNOTSUPP; @@ -2597,7 +2608,7 @@ retry: if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { - if (soft_offline_free_page(page) && try_again) { + if (!page_handle_poison(page, true, false) && try_again) { try_again = false; flags &= ~MF_COUNT_INCREASED; goto retry; @@ -2608,24 +2619,3 @@ retry: return ret; } - -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ - int i; - - /* - * A further optimization is to have per section refcounted - * num_poisoned_pages. But that would need more space per memmap, so - * for now just do a quick global check to speed up this routine in the - * absence of bad pages. - */ - if (atomic_long_read(&num_poisoned_pages) == 0) - return; - - for (i = 0; i < nr_pages; i++) { - if (PageHWPoison(&memmap[i])) { - num_poisoned_pages_dec(); - ClearPageHWPoison(&memmap[i]); - } - } -} diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c new file mode 100644 index 000000000000..c734658c6242 --- /dev/null +++ b/mm/memory-tiers.c @@ -0,0 +1,732 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/slab.h> +#include <linux/lockdep.h> +#include <linux/sysfs.h> +#include <linux/kobject.h> +#include <linux/memory.h> +#include <linux/memory-tiers.h> + +#include "internal.h" + +struct memory_tier { + /* hierarchy of memory tiers */ + struct list_head list; + /* list of all memory types part of this tier */ + struct list_head memory_types; + /* + * start value of abstract distance. memory tier maps + * an abstract distance range, + * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE + */ + int adistance_start; + struct device dev; + /* All the nodes that are part of all the lower memory tiers. */ + nodemask_t lower_tier_mask; +}; + +struct demotion_nodes { + nodemask_t preferred; +}; + +struct node_memory_type_map { + struct memory_dev_type *memtype; + int map_count; +}; + +static DEFINE_MUTEX(memory_tier_lock); +static LIST_HEAD(memory_tiers); +static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; +static struct memory_dev_type *default_dram_type; + +static struct bus_type memory_tier_subsys = { + .name = "memory_tiering", + .dev_name = "memory_tier", +}; + +#ifdef CONFIG_MIGRATION +static int top_tier_adistance; +/* + * node_demotion[] examples: + * + * Example 1: + * + * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes. + * + * node distances: + * node 0 1 2 3 + * 0 10 20 30 40 + * 1 20 10 40 30 + * 2 30 40 10 40 + * 3 40 30 40 10 + * + * memory_tiers0 = 0-1 + * memory_tiers1 = 2-3 + * + * node_demotion[0].preferred = 2 + * node_demotion[1].preferred = 3 + * node_demotion[2].preferred = <empty> + * node_demotion[3].preferred = <empty> + * + * Example 2: + * + * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node. + * + * node distances: + * node 0 1 2 + * 0 10 20 30 + * 1 20 10 30 + * 2 30 30 10 + * + * memory_tiers0 = 0-2 + * + * node_demotion[0].preferred = <empty> + * node_demotion[1].preferred = <empty> + * node_demotion[2].preferred = <empty> + * + * Example 3: + * + * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node. + * + * node distances: + * node 0 1 2 + * 0 10 20 30 + * 1 20 10 40 + * 2 30 40 10 + * + * memory_tiers0 = 1 + * memory_tiers1 = 0 + * memory_tiers2 = 2 + * + * node_demotion[0].preferred = 2 + * node_demotion[1].preferred = 0 + * node_demotion[2].preferred = <empty> + * + */ +static struct demotion_nodes *node_demotion __read_mostly; +#endif /* CONFIG_MIGRATION */ + +static inline struct memory_tier *to_memory_tier(struct device *device) +{ + return container_of(device, struct memory_tier, dev); +} + +static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier) +{ + nodemask_t nodes = NODE_MASK_NONE; + struct memory_dev_type *memtype; + + list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling) + nodes_or(nodes, nodes, memtype->nodes); + + return nodes; +} + +static void memory_tier_device_release(struct device *dev) +{ + struct memory_tier *tier = to_memory_tier(dev); + /* + * synchronize_rcu in clear_node_memory_tier makes sure + * we don't have rcu access to this memory tier. + */ + kfree(tier); +} + +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + nodemask_t nmask; + + mutex_lock(&memory_tier_lock); + nmask = get_memtier_nodemask(to_memory_tier(dev)); + ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask)); + mutex_unlock(&memory_tier_lock); + return ret; +} +static DEVICE_ATTR_RO(nodelist); + +static struct attribute *memtier_dev_attrs[] = { + &dev_attr_nodelist.attr, + NULL +}; + +static const struct attribute_group memtier_dev_group = { + .attrs = memtier_dev_attrs, +}; + +static const struct attribute_group *memtier_dev_groups[] = { + &memtier_dev_group, + NULL +}; + +static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) +{ + int ret; + bool found_slot = false; + struct memory_tier *memtier, *new_memtier; + int adistance = memtype->adistance; + unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; + + lockdep_assert_held_once(&memory_tier_lock); + + adistance = round_down(adistance, memtier_adistance_chunk_size); + /* + * If the memtype is already part of a memory tier, + * just return that. + */ + if (!list_empty(&memtype->tier_sibiling)) { + list_for_each_entry(memtier, &memory_tiers, list) { + if (adistance == memtier->adistance_start) + return memtier; + } + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + + list_for_each_entry(memtier, &memory_tiers, list) { + if (adistance == memtier->adistance_start) { + goto link_memtype; + } else if (adistance < memtier->adistance_start) { + found_slot = true; + break; + } + } + + new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); + if (!new_memtier) + return ERR_PTR(-ENOMEM); + + new_memtier->adistance_start = adistance; + INIT_LIST_HEAD(&new_memtier->list); + INIT_LIST_HEAD(&new_memtier->memory_types); + if (found_slot) + list_add_tail(&new_memtier->list, &memtier->list); + else + list_add_tail(&new_memtier->list, &memory_tiers); + + new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; + new_memtier->dev.bus = &memory_tier_subsys; + new_memtier->dev.release = memory_tier_device_release; + new_memtier->dev.groups = memtier_dev_groups; + + ret = device_register(&new_memtier->dev); + if (ret) { + list_del(&memtier->list); + put_device(&memtier->dev); + return ERR_PTR(ret); + } + memtier = new_memtier; + +link_memtype: + list_add(&memtype->tier_sibiling, &memtier->memory_types); + return memtier; +} + +static struct memory_tier *__node_get_memory_tier(int node) +{ + pg_data_t *pgdat; + + pgdat = NODE_DATA(node); + if (!pgdat) + return NULL; + /* + * Since we hold memory_tier_lock, we can avoid + * RCU read locks when accessing the details. No + * parallel updates are possible here. + */ + return rcu_dereference_check(pgdat->memtier, + lockdep_is_held(&memory_tier_lock)); +} + +#ifdef CONFIG_MIGRATION +bool node_is_toptier(int node) +{ + bool toptier; + pg_data_t *pgdat; + struct memory_tier *memtier; + + pgdat = NODE_DATA(node); + if (!pgdat) + return false; + + rcu_read_lock(); + memtier = rcu_dereference(pgdat->memtier); + if (!memtier) { + toptier = true; + goto out; + } + if (memtier->adistance_start <= top_tier_adistance) + toptier = true; + else + toptier = false; +out: + rcu_read_unlock(); + return toptier; +} + +void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) +{ + struct memory_tier *memtier; + + /* + * pg_data_t.memtier updates includes a synchronize_rcu() + * which ensures that we either find NULL or a valid memtier + * in NODE_DATA. protect the access via rcu_read_lock(); + */ + rcu_read_lock(); + memtier = rcu_dereference(pgdat->memtier); + if (memtier) + *targets = memtier->lower_tier_mask; + else + *targets = NODE_MASK_NONE; + rcu_read_unlock(); +} + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + struct demotion_nodes *nd; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + target = node_random(&nd->preferred); + rcu_read_unlock(); + + return target; +} + +static void disable_all_demotion_targets(void) +{ + struct memory_tier *memtier; + int node; + + for_each_node_state(node, N_MEMORY) { + node_demotion[node].preferred = NODE_MASK_NONE; + /* + * We are holding memory_tier_lock, it is safe + * to access pgda->memtier. + */ + memtier = __node_get_memory_tier(node); + if (memtier) + memtier->lower_tier_mask = NODE_MASK_NONE; + } + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for all memory + * nodes. Failing here is OK. It might just indicate + * being at the end of a chain. + */ +static void establish_demotion_targets(void) +{ + struct memory_tier *memtier; + struct demotion_nodes *nd; + int target = NUMA_NO_NODE, node; + int distance, best_distance; + nodemask_t tier_nodes, lower_tier; + + lockdep_assert_held_once(&memory_tier_lock); + + if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION)) + return; + + disable_all_demotion_targets(); + + for_each_node_state(node, N_MEMORY) { + best_distance = -1; + nd = &node_demotion[node]; + + memtier = __node_get_memory_tier(node); + if (!memtier || list_is_last(&memtier->list, &memory_tiers)) + continue; + /* + * Get the lower memtier to find the demotion node list. + */ + memtier = list_next_entry(memtier, list); + tier_nodes = get_memtier_nodemask(memtier); + /* + * find_next_best_node, use 'used' nodemask as a skip list. + * Add all memory nodes except the selected memory tier + * nodelist to skip list so that we find the best node from the + * memtier nodelist. + */ + nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); + + /* + * Find all the nodes in the memory tier node list of same best distance. + * add them to the preferred mask. We randomly select between nodes + * in the preferred mask when allocating pages during demotion. + */ + do { + target = find_next_best_node(node, &tier_nodes); + if (target == NUMA_NO_NODE) + break; + + distance = node_distance(node, target); + if (distance == best_distance || best_distance == -1) { + best_distance = distance; + node_set(target, nd->preferred); + } else { + break; + } + } while (1); + } + /* + * Promotion is allowed from a memory tier to higher + * memory tier only if the memory tier doesn't include + * compute. We want to skip promotion from a memory tier, + * if any node that is part of the memory tier have CPUs. + * Once we detect such a memory tier, we consider that tier + * as top tiper from which promotion is not allowed. + */ + list_for_each_entry_reverse(memtier, &memory_tiers, list) { + tier_nodes = get_memtier_nodemask(memtier); + nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); + if (!nodes_empty(tier_nodes)) { + /* + * abstract distance below the max value of this memtier + * is considered toptier. + */ + top_tier_adistance = memtier->adistance_start + + MEMTIER_CHUNK_SIZE - 1; + break; + } + } + /* + * Now build the lower_tier mask for each node collecting node mask from + * all memory tier below it. This allows us to fallback demotion page + * allocation to a set of nodes that is closer the above selected + * perferred node. + */ + lower_tier = node_states[N_MEMORY]; + list_for_each_entry(memtier, &memory_tiers, list) { + /* + * Keep removing current tier from lower_tier nodes, + * This will remove all nodes in current and above + * memory tier from the lower_tier mask. + */ + tier_nodes = get_memtier_nodemask(memtier); + nodes_andnot(lower_tier, lower_tier, tier_nodes); + memtier->lower_tier_mask = lower_tier; + } +} + +#else +static inline void disable_all_demotion_targets(void) {} +static inline void establish_demotion_targets(void) {} +#endif /* CONFIG_MIGRATION */ + +static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) +{ + if (!node_memory_types[node].memtype) + node_memory_types[node].memtype = memtype; + /* + * for each device getting added in the same NUMA node + * with this specific memtype, bump the map count. We + * Only take memtype device reference once, so that + * changing a node memtype can be done by droping the + * only reference count taken here. + */ + + if (node_memory_types[node].memtype == memtype) { + if (!node_memory_types[node].map_count++) + kref_get(&memtype->kref); + } +} + +static struct memory_tier *set_node_memory_tier(int node) +{ + struct memory_tier *memtier; + struct memory_dev_type *memtype; + pg_data_t *pgdat = NODE_DATA(node); + + + lockdep_assert_held_once(&memory_tier_lock); + + if (!node_state(node, N_MEMORY)) + return ERR_PTR(-EINVAL); + + __init_node_memory_type(node, default_dram_type); + + memtype = node_memory_types[node].memtype; + node_set(node, memtype->nodes); + memtier = find_create_memory_tier(memtype); + if (!IS_ERR(memtier)) + rcu_assign_pointer(pgdat->memtier, memtier); + return memtier; +} + +static void destroy_memory_tier(struct memory_tier *memtier) +{ + list_del(&memtier->list); + device_unregister(&memtier->dev); +} + +static bool clear_node_memory_tier(int node) +{ + bool cleared = false; + pg_data_t *pgdat; + struct memory_tier *memtier; + + pgdat = NODE_DATA(node); + if (!pgdat) + return false; + + /* + * Make sure that anybody looking at NODE_DATA who finds + * a valid memtier finds memory_dev_types with nodes still + * linked to the memtier. We achieve this by waiting for + * rcu read section to finish using synchronize_rcu. + * This also enables us to free the destroyed memory tier + * with kfree instead of kfree_rcu + */ + memtier = __node_get_memory_tier(node); + if (memtier) { + struct memory_dev_type *memtype; + + rcu_assign_pointer(pgdat->memtier, NULL); + synchronize_rcu(); + memtype = node_memory_types[node].memtype; + node_clear(node, memtype->nodes); + if (nodes_empty(memtype->nodes)) { + list_del_init(&memtype->tier_sibiling); + if (list_empty(&memtier->memory_types)) + destroy_memory_tier(memtier); + } + cleared = true; + } + return cleared; +} + +static void release_memtype(struct kref *kref) +{ + struct memory_dev_type *memtype; + + memtype = container_of(kref, struct memory_dev_type, kref); + kfree(memtype); +} + +struct memory_dev_type *alloc_memory_type(int adistance) +{ + struct memory_dev_type *memtype; + + memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); + if (!memtype) + return ERR_PTR(-ENOMEM); + + memtype->adistance = adistance; + INIT_LIST_HEAD(&memtype->tier_sibiling); + memtype->nodes = NODE_MASK_NONE; + kref_init(&memtype->kref); + return memtype; +} +EXPORT_SYMBOL_GPL(alloc_memory_type); + +void destroy_memory_type(struct memory_dev_type *memtype) +{ + kref_put(&memtype->kref, release_memtype); +} +EXPORT_SYMBOL_GPL(destroy_memory_type); + +void init_node_memory_type(int node, struct memory_dev_type *memtype) +{ + + mutex_lock(&memory_tier_lock); + __init_node_memory_type(node, memtype); + mutex_unlock(&memory_tier_lock); +} +EXPORT_SYMBOL_GPL(init_node_memory_type); + +void clear_node_memory_type(int node, struct memory_dev_type *memtype) +{ + mutex_lock(&memory_tier_lock); + if (node_memory_types[node].memtype == memtype) + node_memory_types[node].map_count--; + /* + * If we umapped all the attached devices to this node, + * clear the node memory type. + */ + if (!node_memory_types[node].map_count) { + node_memory_types[node].memtype = NULL; + kref_put(&memtype->kref, release_memtype); + } + mutex_unlock(&memory_tier_lock); +} +EXPORT_SYMBOL_GPL(clear_node_memory_type); + +static int __meminit memtier_hotplug_callback(struct notifier_block *self, + unsigned long action, void *_arg) +{ + struct memory_tier *memtier; + struct memory_notify *arg = _arg; + + /* + * Only update the node migration order when a node is + * changing status, like online->offline. + */ + if (arg->status_change_nid < 0) + return notifier_from_errno(0); + + switch (action) { + case MEM_OFFLINE: + mutex_lock(&memory_tier_lock); + if (clear_node_memory_tier(arg->status_change_nid)) + establish_demotion_targets(); + mutex_unlock(&memory_tier_lock); + break; + case MEM_ONLINE: + mutex_lock(&memory_tier_lock); + memtier = set_node_memory_tier(arg->status_change_nid); + if (!IS_ERR(memtier)) + establish_demotion_targets(); + mutex_unlock(&memory_tier_lock); + break; + } + + return notifier_from_errno(0); +} + +static int __init memory_tier_init(void) +{ + int ret, node; + struct memory_tier *memtier; + + ret = subsys_virtual_register(&memory_tier_subsys, NULL); + if (ret) + panic("%s() failed to register memory tier subsystem\n", __func__); + +#ifdef CONFIG_MIGRATION + node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); +#endif + mutex_lock(&memory_tier_lock); + /* + * For now we can have 4 faster memory tiers with smaller adistance + * than default DRAM tier. + */ + default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); + if (IS_ERR(default_dram_type)) + panic("%s() failed to allocate default DRAM tier\n", __func__); + + /* + * Look at all the existing N_MEMORY nodes and add them to + * default memory tier or to a tier if we already have memory + * types assigned. + */ + for_each_node_state(node, N_MEMORY) { + memtier = set_node_memory_tier(node); + if (IS_ERR(memtier)) + /* + * Continue with memtiers we are able to setup + */ + break; + } + establish_demotion_targets(); + mutex_unlock(&memory_tier_lock); + + hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); + return 0; +} +subsys_initcall(memory_tier_init); + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_MIGRATION +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled ? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &numa_demotion_enabled); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif /* CONFIG_SYSFS */ +#endif diff --git a/mm/memory.c b/mm/memory.c index 4ba73f5aa8bb..aad226daf41b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> +#include <linux/kmsan.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> @@ -66,6 +67,7 @@ #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> @@ -74,6 +76,7 @@ #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> +#include <linux/sched/sysctl.h> #include <trace/events/kmem.h> @@ -125,18 +128,6 @@ int randomize_va_space __read_mostly = 2; #endif -#ifndef arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) -{ - /* - * Those arches which don't have hw access flag feature need to - * implement their own helper. By default, "true" means pagefault - * will be hit on old pte. - */ - return true; -} -#endif - #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -171,58 +162,11 @@ static int __init init_zero_pfn(void) } early_initcall(init_zero_pfn); -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) -{ - trace_rss_stat(mm, member, count); -} - -#if defined(SPLIT_RSS_COUNTING) - -void sync_mm_rss(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - if (current->rss_stat.count[i]) { - add_mm_counter(mm, i, current->rss_stat.count[i]); - current->rss_stat.count[i] = 0; - } - } - current->rss_stat.events = 0; -} - -static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) -{ - struct task_struct *task = current; - - if (likely(task->mm == mm)) - task->rss_stat.count[member] += val; - else - add_mm_counter(mm, member, val); -} -#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) -#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) - -/* sync counter once per 64 page faults */ -#define TASK_RSS_EVENTS_THRESH (64) -static void check_sync_rss_stat(struct task_struct *task) -{ - if (unlikely(task != current)) - return; - if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - sync_mm_rss(task->mm); -} -#else /* SPLIT_RSS_COUNTING */ - -#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) -#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) - -static void check_sync_rss_stat(struct task_struct *task) +void mm_trace_rss_stat(struct mm_struct *mm, int member) { + trace_rss_stat(mm, member); } -#endif /* SPLIT_RSS_COUNTING */ - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. @@ -402,12 +346,21 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { - while (vma) { - struct vm_area_struct *next = vma->vm_next; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + do { unsigned long addr = vma->vm_start; + struct vm_area_struct *next; + + /* + * Note: USER_PGTABLES_CEILING may be passed as ceiling and may + * be 0. This will underflow and is okay. + */ + next = mas_find(&mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -426,7 +379,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = vma->vm_next; + next = mas_find(&mas, ceiling - 1); unlink_anon_vmas(vma); unlink_file_vma(vma); } @@ -434,7 +387,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, floor, next ? next->vm_start : ceiling); } vma = next; - } + } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) @@ -1341,15 +1294,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } -/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct folio *single_folio; /* Locked folio to be unmapped */ - bool even_cows; /* Zap COWed private pages too? */ - zap_flags_t zap_flags; /* Extra flags for zapping */ -}; - /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { @@ -1430,6 +1374,8 @@ again: break; if (pte_present(ptent)) { + unsigned int delay_rmap; + page = vm_normal_page(vma, addr, ptent); if (unlikely(!should_zap_page(details, page))) continue; @@ -1441,20 +1387,26 @@ again: if (unlikely(!page)) continue; + delay_rmap = 0; if (!PageAnon(page)) { if (pte_dirty(ptent)) { - force_flush = 1; set_page_dirty(page); + if (tlb_delay_rmap(tlb)) { + delay_rmap = 1; + force_flush = 1; + } } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, vma, false); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + if (!delay_rmap) { + page_remove_rmap(page, vma, false); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { force_flush = 1; addr += PAGE_SIZE; break; @@ -1511,8 +1463,10 @@ again: arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ - if (force_flush) + if (force_flush) { tlb_flush_mmu_tlbonly(tlb); + tlb_flush_rmaps(tlb, vma); + } pte_unmap_unlock(start_pte, ptl); /* @@ -1685,10 +1639,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; - i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags); - i_mmap_unlock_write(vma->vm_file->f_mapping); } } else unmap_page_range(tlb, vma, start, end, details); @@ -1698,6 +1650,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather + * @mt: the maple tree * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -1713,22 +1666,24 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mmu_notifier_range range; struct zap_details details = { - .zap_flags = ZAP_FLAG_DROP_MARKER, + .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, /* Careful - we need to zap private pages too! */ .even_cows = true, }; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1743,8 +1698,11 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { + struct maple_tree *mt = &vma->vm_mm->mm_mt; + unsigned long end = start + size; struct mmu_notifier_range range; struct mmu_gather tlb; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -1752,8 +1710,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + do { unmap_single_vma(&tlb, vma, start, range.end, NULL); + } while ((vma = mas_find(&mas, end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -1767,19 +1726,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * * The range must fit into one VMA. */ -static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { + const unsigned long end = address + size; struct mmu_notifier_range range; struct mmu_gather tlb; lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - address, address + size); + address, end); + if (is_vm_hugetlb_page(vma)) + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - unmap_single_vma(&tlb, vma, address, range.end, details); + /* + * unmap 'address-end' not 'range.start-range.end' as range + * could have been expanded for hugetlb pmd sharing. + */ + unmap_single_vma(&tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -1853,7 +1820,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -2841,10 +2808,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf) return same; } -static inline bool __wp_page_copy_user(struct page *dst, struct page *src, - struct vm_fault *vmf) +/* + * Return: + * 0: copied succeeded + * -EHWPOISON: copy failed due to hwpoison in source page + * -EAGAIN: copied failed (some other reason) + */ +static inline int __wp_page_copy_user(struct page *dst, struct page *src, + struct vm_fault *vmf) { - bool ret; + int ret; void *kaddr; void __user *uaddr; bool locked = false; @@ -2853,8 +2826,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - copy_user_highpage(dst, src, addr, vma); - return true; + if (copy_mc_user_highpage(dst, src, addr, vma)) { + memory_failure_queue(page_to_pfn(src), 0); + return -EHWPOISON; + } + return 0; } /* @@ -2870,7 +2846,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ - if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { + if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); @@ -2881,7 +2857,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, * and update local tlb only */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2906,7 +2882,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2925,7 +2901,7 @@ warn: } } - ret = true; + ret = 0; pte_unlock: if (locked) @@ -3097,6 +3073,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; + int ret; delayacct_wpcopy_start(); @@ -3114,20 +3091,23 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; - if (!__wp_page_copy_user(new_page, old_page, vmf)) { + ret = __wp_page_copy_user(new_page, old_page, vmf); + if (ret) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. + * The -EHWPOISON case will not be retried. */ put_page(new_page); if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return 0; + return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } + kmsan_copy_page_meta(new_page, old_page); } if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) @@ -3148,12 +3128,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter(mm, mm_counter_file(old_page)); + inc_mm_counter(mm, MM_ANONPAGES); } } else { - inc_mm_counter_fast(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); @@ -3234,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } delayacct_wpcopy_end(); - return (page_copied && !unshare) ? VM_FAULT_WRITE : 0; + return 0; oom_free_new: put_page(new_page); oom: @@ -3298,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - vm_fault_t ret = VM_FAULT_WRITE; + vm_fault_t ret = 0; get_page(vmf->page); @@ -3362,9 +3341,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; - - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); + struct folio *folio = NULL; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) { @@ -3382,13 +3359,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); - if (!vmf->page) { - if (unlikely(unshare)) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + /* + * Shared mapping: we are guaranteed to have VM_WRITE and + * FAULT_FLAG_WRITE set at this point. + */ + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -3396,84 +3372,76 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ - if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED)) + if (!vmf->page) return wp_pfn_shared(vmf); - - pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf); + return wp_page_shared(vmf); } + if (vmf->page) + folio = page_folio(vmf->page); + /* - * Take out anonymous pages first, anonymous shared vmas are - * not dirty accountable. + * Private mapping: create an exclusive anonymous page copy if reuse + * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. */ - if (PageAnon(vmf->page)) { - struct page *page = vmf->page; - + if (folio && folio_test_anon(folio)) { /* * If the page is exclusive to this process we must reuse the * page without further checks. */ - if (PageAnonExclusive(page)) + if (PageAnonExclusive(vmf->page)) goto reuse; /* - * We have to verify under page lock: these early checks are - * just an optimization to avoid locking the page and freeing + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * - * PageKsm() doesn't necessarily raise the page refcount. + * KSM doesn't necessarily raise the folio refcount. */ - if (PageKsm(page) || page_count(page) > 3) + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) goto copy; - if (!PageLRU(page)) + if (!folio_test_lru(folio)) /* * Note: We cannot easily detect+handle references from - * remote LRU pagevecs or references to PageLRU() pages. + * remote LRU pagevecs or references to LRU folios. */ lru_add_drain(); - if (page_count(page) > 1 + PageSwapCache(page)) + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) goto copy; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto copy; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (PageKsm(page) || page_count(page) != 1) { - unlock_page(page); + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); goto copy; } /* - * Ok, we've got the only page reference from our mapping - * and the page is locked, it's dark out, and we're wearing + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(page, vma); - unlock_page(page); + page_move_anon_rmap(vmf->page, vma); + folio_unlock(folio); reuse: if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } wp_page_reuse(vmf); - return VM_FAULT_WRITE; - } else if (unshare) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; - } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED))) { - return wp_page_shared(vmf); } copy: /* * Ok, we need to copy. Oh, well.. */ - get_page(vmf->page); + if (folio) + folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM - if (PageKsm(vmf->page)) + if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); @@ -3612,11 +3580,11 @@ EXPORT_SYMBOL(unmap_mapping_range); */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, vma->vm_mm, vmf->address & PAGE_MASK, @@ -3626,23 +3594,23 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) - restore_exclusive_pte(vma, page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); - unlock_page(page); + folio_unlock(folio); mmu_notifier_invalidate_range_end(&range); return 0; } -static inline bool should_try_to_free_swap(struct page *page, +static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || - PageMlocked(page)) + if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || + folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we @@ -3650,8 +3618,8 @@ static inline bool should_try_to_free_swap(struct page *page, * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ - return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && - page_count(page) == 2; + return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && + folio_ref_count(folio) == 2; } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -3693,11 +3661,14 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) unsigned long marker = pte_marker_get(entry); /* - * PTE markers should always be with file-backed memories, and the - * marker should never be empty. If anything weird happened, the best - * thing to do is to kill the process along with its mm. + * PTE markers should never be empty. If anything weird happened, + * the best thing to do is to kill the process along with its mm. */ - if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + if (WARN_ON_ONCE(!marker)) + return VM_FAULT_SIGBUS; + + /* Higher priority than uffd-wp when data corrupted */ + if (marker & PTE_MARKER_SWAPIN_ERROR) return VM_FAULT_SIGBUS; if (pte_marker_entry_uffd_wp(entry)) @@ -3718,7 +3689,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache; + struct folio *swapcache, *folio = NULL; + struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool exclusive = false; @@ -3741,11 +3713,23 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. + */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_swapin_error_entry(entry)) { - ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { @@ -3760,21 +3744,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - page = lookup_swap_cache(entry, vma, vmf->address); - swapcache = page; + folio = swap_cache_get_folio(entry, vma, vmf->address); + if (folio) + page = folio_file_page(folio, swp_offset(entry)); + swapcache = folio; - if (!page) { + if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (page) { - __SetPageLocked(page); - __SetPageSwapBacked(page); - - if (mem_cgroup_swapin_charge_page(page, - vma->vm_mm, GFP_KERNEL, entry)) { + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, + vma, vmf->address, false); + page = &folio->page; + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + if (mem_cgroup_swapin_charge_folio(folio, + vma->vm_mm, GFP_KERNEL, + entry)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3782,23 +3770,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) shadow = get_shadow_from_swap_cache(entry); if (shadow) - workingset_refault(page_folio(page), - shadow); + workingset_refault(folio, shadow); - lru_cache_add(page); + folio_add_lru(folio); /* To provide entry to swap_readpage() */ - set_page_private(page, entry.val); + folio_set_swap_entry(folio, entry); swap_readpage(page, true, NULL); - set_page_private(page, 0); + folio->private = NULL; } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - swapcache = page; + if (page) + folio = page_folio(page); + swapcache = folio; } - if (!page) { + if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. @@ -3823,7 +3812,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); + locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); if (!locked) { ret |= VM_FAULT_RETRY; @@ -3832,13 +3821,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (swapcache) { /* - * Make sure try_to_free_swap or swapoff did not release the + * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!PageSwapCache(page) || + if (unlikely(!folio_test_swapcache(folio) || page_private(page) != entry.val)) goto out_page; @@ -3850,9 +3839,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; - page = swapcache; goto out_page; } + folio = page_folio(page); /* * If we want to map a page that's in the swapcache writable, we @@ -3860,8 +3849,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner. Try removing the extra reference from the local LRU * pagevecs if required. */ - if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && - !PageKsm(page) && !PageLRU(page)) + if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && + !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3875,7 +3864,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } @@ -3888,26 +3877,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ - BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); - BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); + BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ - if (!PageKsm(page)) { + if (!folio_test_ksm(folio)) { /* * Note that pte_swp_exclusive() == false for architectures * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. */ exclusive = pte_swp_exclusive(vmf->orig_pte); - if (page != swapcache) { + if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. */ exclusive = true; - } else if (exclusive && PageWriteback(page) && + } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support @@ -3937,11 +3926,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * yet. */ swap_free(entry); - if (should_try_to_free_swap(page, vma, vmf->flags)) - try_to_free_swap(page); + if (should_try_to_free_swap(folio, vma, vmf->flags)) + folio_free_swap(folio); - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); /* @@ -3950,11 +3939,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ - if (!PageKsm(page) && (exclusive || page_count(page) == 1)) { + if (!folio_test_ksm(folio) && + (exclusive || folio_ref_count(folio) == 1)) { if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; - ret |= VM_FAULT_WRITE; } rmap_flags |= RMAP_EXCLUSIVE; } @@ -3968,19 +3957,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->orig_pte = pte; /* ksm created a completely new copy */ - if (unlikely(page != swapcache && swapcache)) { + if (unlikely(folio != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_lru_vma(folio, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } - VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page))); + VM_BUG_ON(!folio_test_anon(folio) || + (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - unlock_page(page); - if (page != swapcache && swapcache) { + folio_unlock(folio); + if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3989,8 +3979,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * so that the swap count won't change under a * parallel locked swapcache. */ - unlock_page(swapcache); - put_page(swapcache); + folio_unlock(swapcache); + folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { @@ -4011,12 +4001,12 @@ out: out_nomap: pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: - unlock_page(page); + folio_unlock(folio); out_release: - put_page(page); - if (page != swapcache && swapcache) { - unlock_page(swapcache); - put_page(swapcache); + folio_put(folio); + if (folio != swapcache && swapcache) { + folio_unlock(swapcache); + folio_put(swapcache); } if (si) put_swap_device(si); @@ -4104,7 +4094,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; } @@ -4119,7 +4109,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -4309,11 +4299,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); @@ -4386,14 +4376,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - ret = 0; + /* Re-check under ptl */ - if (likely(!vmf_pte_changed(vmf))) + if (likely(!vmf_pte_changed(vmf))) { do_set_pte(vmf, page, vmf->address); - else + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, vmf->address, vmf->pte); + + ret = 0; + } else { + update_mmu_tlb(vma, vmf->address, vmf->pte); ret = VM_FAULT_NOPAGE; + } - update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -4677,10 +4673,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = NUMA_NO_NODE; + bool writable = false; int last_cpupid; int target_nid; pte_t pte, old_pte; - bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* @@ -4699,6 +4695,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); + /* + * Detect now whether the PTE could be writable; this information + * is only valid while holding the PT lock. + */ + writable = pte_write(pte); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pte_writable(vma, vmf->address, pte)) + writable = true; + page = vm_normal_page(vma, vmf->address, pte); if (!page || is_zone_device_page(page)) goto out_map; @@ -4715,7 +4720,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; /* @@ -4725,8 +4730,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; - last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(page_nid)) + last_cpupid = (-1 & LAST_CPUPID_MASK); + else + last_cpupid = page_cpupid_last(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); if (target_nid == NUMA_NO_NODE) { @@ -4734,6 +4747,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); + writable = false; /* Migrate to the requested node */ if (migrate_misplaced_page(page, vma, target_nid)) { @@ -4762,7 +4776,7 @@ out_map: old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); - if (was_writable) + if (writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); @@ -4783,6 +4797,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + vm_fault_t ret; if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) && @@ -4790,11 +4805,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf); } - if (vmf->vma->vm_ops->huge_fault) { - vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vmf->vma->vm_ops->huge_fault) { + ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } /* COW or write-notify handled on pte level: split pmd. */ @@ -4820,14 +4837,17 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + vm_fault_t ret; + /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) goto split; - if (vmf->vma->vm_ops->huge_fault) { - vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); - - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vmf->vma->vm_ops->huge_fault) { + ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } split: /* COW or write-notify not handled on PUD level: split pud.*/ @@ -4985,7 +5005,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5019,7 +5039,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5114,6 +5134,51 @@ static inline void mm_account_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } +#ifdef CONFIG_LRU_GEN +static void lru_gen_enter_fault(struct vm_area_struct *vma) +{ + /* the LRU algorithm doesn't apply to sequential or random reads */ + current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); +} + +static void lru_gen_exit_fault(void) +{ + current->in_lru_fault = false; +} +#else +static void lru_gen_enter_fault(struct vm_area_struct *vma) +{ +} + +static void lru_gen_exit_fault(void) +{ +} +#endif /* CONFIG_LRU_GEN */ + +static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, + unsigned int *flags) +{ + if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { + if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) + return VM_FAULT_SIGSEGV; + /* + * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's + * just treat it like an ordinary read-fault otherwise. + */ + if (!is_cow_mapping(vma->vm_flags)) + *flags &= ~FAULT_FLAG_UNSHARE; + } else if (*flags & FAULT_FLAG_WRITE) { + /* Write faults on read-only mappings are impossible ... */ + if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) + return VM_FAULT_SIGSEGV; + /* ... and FOLL_FORCE only applies to COW mappings. */ + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && + !is_cow_mapping(vma->vm_flags))) + return VM_FAULT_SIGSEGV; + } + return 0; +} + /* * By the time we get here, we already hold the mm semaphore * @@ -5130,8 +5195,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); + ret = sanitize_fault_flags(vma, &flags); + if (ret) + return ret; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, @@ -5145,11 +5211,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); + lru_gen_enter_fault(vma); + if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); + lru_gen_exit_fault(); + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* @@ -5637,11 +5707,11 @@ static void clear_gigantic_page(struct page *page, unsigned int pages_per_huge_page) { int i; - struct page *p = page; + struct page *p; might_sleep(); - for (i = 0; i < pages_per_huge_page; - i++, p = mem_map_next(p, page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + p = nth_page(page, i); cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); } @@ -5677,13 +5747,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, struct page *dst_base = dst; struct page *src_base = src; - for (i = 0; i < pages_per_huge_page; ) { + for (i = 0; i < pages_per_huge_page; i++) { + dst = nth_page(dst_base, i); + src = nth_page(src_base, i); + cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); } } @@ -5730,10 +5799,10 @@ long copy_huge_page_from_user(struct page *dst_page, void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; - struct page *subpage = dst_page; + struct page *subpage; - for (i = 0; i < pages_per_huge_page; - i++, subpage = mem_map_next(subpage, dst_page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + subpage = nth_page(dst_page, i); if (allow_pagefault) page_kaddr = kmap(subpage); else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fad6d1f2262a..fd40f7e9f176 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1085,8 +1085,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; @@ -1806,8 +1805,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; @@ -1940,8 +1938,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { - kswapd_stop(node); kcompactd_stop(node); + kswapd_stop(node); } writeback_set_ratelimit(); @@ -1969,11 +1967,10 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { - int ret = !is_memblock_offlined(mem); int *nid = arg; *nid = mem->nid; - if (unlikely(ret)) { + if (unlikely(mem->state != MEM_OFFLINE)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b73d3248d976..02c8a712282f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } @@ -654,7 +655,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->vma; + struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; @@ -669,9 +670,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, /* hole at head side of range */ return -EFAULT; } + next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && - (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; @@ -785,26 +787,29 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - unsigned long vmstart; - unsigned long vmend; - vma = find_vma(mm, start); - VM_BUG_ON(!vma); + prev = mas_prev(&mas, 0); + if (unlikely(!prev)) + mas_set(&mas, start); + + vma = mas_find(&mas, end - 1); + if (WARN_ON(!vma)) + return 0; - prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; - for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); + for (; vma; vma = mas_next(&mas, end - 1)) { + unsigned long vmstart = max(start, vma->vm_start); + unsigned long vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) - continue; + goto next; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); @@ -813,6 +818,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto replace; } @@ -820,19 +827,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } - replace: +replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; +next: + prev = vma; } - out: +out: return err; } @@ -853,12 +866,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } + task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { + task_unlock(current); mpol_put(new); goto out; } - task_lock(current); + old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) @@ -1047,6 +1062,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; + struct vm_area_struct *vma; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { @@ -1062,8 +1078,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ + vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1193,14 +1210,13 @@ static struct page *new_page(struct page *page, unsigned long start) struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; + VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - vma = find_vma(current->mm, start); - while (vma) { + for_each_vma(vmi, vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; - vma = vma->vm_next; } if (folio_test_hugetlb(src)) @@ -1259,7 +1275,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) @@ -1478,6 +1494,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long vmend; unsigned long end; int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) @@ -1495,7 +1512,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) @@ -1503,9 +1520,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); - vma = find_vma(mm, start); - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - + for_each_vma_range(vmi, vma, end) { vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); new = mpol_dup(vma_policy(vma)); @@ -1525,6 +1540,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le * the home node for vmas we already updated before. */ if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + mpol_put(new); err = -EOPNOTSUPP; break; } @@ -1803,7 +1819,7 @@ bool vma_policy_mof(struct vm_area_struct *vma) return pol->flags & MPOL_F_MOF; } -static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; diff --git a/mm/mempool.c b/mm/mempool.c index 96488b13a1ef..734bcf5afbb7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -57,8 +57,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size) static void check_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->free == mempool_free_slab || pool->free == mempool_kfree) { - __check_element(pool, element, ksize(element)); + if (pool->free == mempool_kfree) { + __check_element(pool, element, (size_t)pool->pool_data); + } else if (pool->free == mempool_free_slab) { + __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; @@ -80,8 +82,10 @@ static void __poison_element(void *element, size_t size) static void poison_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) { - __poison_element(element, ksize(element)); + if (pool->alloc == mempool_kmalloc) { + __poison_element(element, (size_t)pool->pool_data); + } else if (pool->alloc == mempool_alloc_slab) { + __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; @@ -111,8 +115,10 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) static void kasan_unpoison_element(mempool_t *pool, void *element) { - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_unpoison_range(element, __ksize(element)); + if (pool->alloc == mempool_kmalloc) + kasan_unpoison_range(element, (size_t)pool->pool_data); + else if (pool->alloc == mempool_alloc_slab) + kasan_unpoison_range(element, kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_unpoison_pages(element, (unsigned long)pool->pool_data, false); diff --git a/mm/memremap.c b/mm/memremap.c index 58b20c3c300b..08cbf54fe037 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap) int i; percpu_ref_kill(&pgmap->ref); - for (i = 0; i < pgmap->nr_range; i++) - percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + for (i = 0; i < pgmap->nr_range; i++) + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + wait_for_completion(&pgmap->done); for (i = 0; i < pgmap->nr_range; i++) @@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: @@ -330,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "File system DAX not supported\n"); return ERR_PTR(-EINVAL); } + params.pgprot = pgprot_decrypted(params.pgprot); break; case MEMORY_DEVICE_GENERIC: break; @@ -454,7 +460,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); - if (pgmap && !percpu_ref_tryget_live(&pgmap->ref)) + if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref)) pgmap = NULL; rcu_read_unlock(); @@ -502,11 +508,28 @@ void free_zone_device_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_COHERENT) + /* + * Reset the page count to 1 to prepare for handing out the page + * again. + */ + set_page_count(page, 1); + else + put_dev_pagemap(page->pgmap); +} + +void zone_device_page_init(struct page *page) +{ /* - * Reset the page count to 1 to prepare for handing out the page again. + * Drivers shouldn't be allocating pages after calling + * memunmap_pages(). */ + WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref)); set_page_count(page, 1); + lock_page(page); } +EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX bool __put_devmap_managed_page_refs(struct page *page, int refs) diff --git a/mm/migrate.c b/mm/migrate.c index 6a1597c92261..a4d3fc65085f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include <linux/memory.h> #include <linux/random.h> #include <linux/sched/sysctl.h> +#include <linux/memory-tiers.h> #include <asm/tlbflush.h> @@ -73,13 +74,22 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (unlikely(!get_page_unless_zero(page))) goto out; + if (unlikely(PageSlab(page))) + goto out_putpage; + /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ + smp_rmb(); /* - * Check PageMovable before holding a PG_lock because page's owner - * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grabbing the lock ruins page's owner side. + * Check movable flag before taking the page lock because + * we use non-atomic bitops on newly allocated page flags so + * unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; + /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ + smp_rmb(); + if (unlikely(PageSlab(page))) + goto out_putpage; + /* * As movable pages are not isolated from LRU lists, concurrent * compaction threads can race against page migration functions @@ -198,7 +208,7 @@ static bool remove_migration_pte(struct folio *folio, #endif folio_get(folio); - pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); + pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); if (pte_swp_soft_dirty(*pvmw.pte)) pte = pte_mksoft_dirty(pte); @@ -206,6 +216,10 @@ static bool remove_migration_pte(struct folio *folio, * Recheck VMA as permissions can change since migration started */ entry = pte_to_swp_entry(*pvmw.pte); + if (!is_migration_entry_young(entry)) + pte = pte_mkold(pte); + if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); else if (pte_swp_uffd_wp(*pvmw.pte)) @@ -560,6 +574,18 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) * future migrations of this same page. */ cpupid = page_cpupid_xchg_last(&folio->page, -1); + /* + * For memory tiering mode, when migrate between slow and fast + * memory node, reset cpupid, because that is used to record + * page access time in slow memory node. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { + bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); + bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); + + if (f_toptier != t_toptier) + cpupid = -1; + } page_cpupid_xchg_last(&newfolio->page, cpupid); folio_migrate_ksm(newfolio, folio); @@ -608,6 +634,25 @@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count) +{ + int rc; + + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ + + rc = folio_migrate_mapping(mapping, dst, src, extra_count); + + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} + /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. @@ -623,20 +668,7 @@ EXPORT_SYMBOL(folio_migrate_copy); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - int rc; - - BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - - rc = folio_migrate_mapping(mapping, dst, src, 0); - - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return migrate_folio_extra(mapping, dst, src, mode, 0); } EXPORT_SYMBOL(migrate_folio); @@ -797,6 +829,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, { return __buffer_migrate_folio(mapping, dst, src, mode, true); } +EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); #endif int filemap_migrate_folio(struct address_space *mapping, @@ -976,17 +1009,15 @@ out: return rc; } -static int __unmap_and_move(struct page *page, struct page *newpage, +static int __unmap_and_move(struct folio *src, struct folio *dst, int force, enum migrate_mode mode) { - struct folio *folio = page_folio(page); - struct folio *dst = page_folio(newpage); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(page); + bool is_lru = !__PageMovable(&src->page); - if (!trylock_page(page)) { + if (!folio_trylock(src)) { if (!force || mode == MIGRATE_ASYNC) goto out; @@ -1006,10 +1037,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (current->flags & PF_MEMALLOC) goto out; - lock_page(page); + folio_lock(src); } - if (PageWriteback(page)) { + if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, @@ -1026,39 +1057,39 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!force) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(src); } /* - * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * - * Only page_get_anon_vma() understands the subtleties of + * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) - anon_vma = page_get_anon_vma(page); + if (folio_test_anon(src) && !folio_test_ksm(src)) + anon_vma = folio_get_anon_vma(src); /* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one - * holding a reference to newpage at this point. We used to have a BUG - * here if trylock_page(newpage) fails, but would like to allow for - * cases where there might be a race with the previous use of newpage. + * holding a reference to dst at this point. We used to have a BUG + * here if folio_trylock(dst) fails, but would like to allow for + * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG(). */ - if (unlikely(!trylock_page(newpage))) + if (unlikely(!folio_trylock(dst))) goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_folio(dst, folio, mode); + rc = move_to_new_folio(dst, src, mode); goto out_unlock_both; } @@ -1066,7 +1097,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. - * Calling try_to_unmap() against a page->mapping==NULL page will + * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory @@ -1074,133 +1105,134 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed. */ - if (!page->mapping) { - VM_BUG_ON_PAGE(PageAnon(page), page); - if (page_has_private(page)) { - try_to_free_buffers(folio); + if (!src->mapping) { + if (folio_test_private(src)) { + try_to_free_buffers(src); goto out_unlock_both; } - } else if (page_mapped(page)) { + } else if (folio_mapped(src)) { /* Establish migration ptes */ - VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, - page); - try_to_migrate(folio, 0); + VM_BUG_ON_FOLIO(folio_test_anon(src) && + !folio_test_ksm(src) && !anon_vma, src); + try_to_migrate(src, 0); page_was_mapped = true; } - if (!page_mapped(page)) - rc = move_to_new_folio(dst, folio, mode); + if (!folio_mapped(src)) + rc = move_to_new_folio(dst, src, mode); /* - * When successful, push newpage to LRU immediately: so that if it + * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will - * automatically build up the correct newpage->mlock_count for it. + * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ if (rc == MIGRATEPAGE_SUCCESS) { - lru_cache_add(newpage); + folio_add_lru(dst); if (page_was_mapped) lru_add_drain(); } if (page_was_mapped) - remove_migration_ptes(folio, - rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); + remove_migration_ptes(src, + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); out_unlock_both: - unlock_page(newpage); + folio_unlock(dst); out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - unlock_page(page); + folio_unlock(src); out: /* - * If migration is successful, decrease refcount of the newpage, + * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ if (rc == MIGRATEPAGE_SUCCESS) - put_page(newpage); + folio_put(dst); return rc; } /* - * Obtain the lock on page, remove all ptes and migrate the page - * to the newly allocated page in newpage. + * Obtain the lock on folio, remove all ptes and migrate the folio + * to the newly allocated folio in dst. */ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct page *page, + unsigned long private, struct folio *src, int force, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { + struct folio *dst; int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; - if (!thp_migration_supported() && PageTransHuge(page)) + if (!thp_migration_supported() && folio_test_transhuge(src)) return -ENOSYS; - if (page_count(page) == 1) { - /* Page was freed from under us. So we are done. */ - ClearPageActive(page); - ClearPageUnevictable(page); + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. */ goto out; } - newpage = get_new_page(page, private); + newpage = get_new_page(&src->page, private); if (!newpage) return -ENOMEM; + dst = page_folio(newpage); - newpage->private = 0; - rc = __unmap_and_move(page, newpage, force, mode); + dst->private = NULL; + rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) - set_page_owner_migrate_reason(newpage, reason); + set_page_owner_migrate_reason(&dst->page, reason); out: if (rc != -EAGAIN) { /* - * A page that has been migrated has all references - * removed and will be freed. A page that has not been + * A folio that has been migrated has all references + * removed and will be freed. A folio that has not been * migrated will have kept its references and be restored. */ - list_del(&page->lru); + list_del(&src->lru); } /* * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the page to right list unless + * isolation. Otherwise, restore the folio to right list unless * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { /* - * Compaction can migrate also non-LRU pages which are + * Compaction can migrate also non-LRU folios which are * not accounted to NR_ISOLATED_*. They can be recognized - * as __PageMovable + * as __folio_test_movable */ - if (likely(!__PageMovable(page))) - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); + if (likely(!__folio_test_movable(src))) + mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + + folio_is_file_lru(src), -folio_nr_pages(src)); if (reason != MR_MEMORY_FAILURE) /* - * We release the page in page_handle_poison. + * We release the folio in page_handle_poison. */ - put_page(page); + folio_put(src); } else { if (rc != -EAGAIN) - list_add_tail(&page->lru, ret); + list_add_tail(&src->lru, ret); if (put_new_page) - put_new_page(newpage, private); + put_new_page(&dst->page, private); else - put_page(newpage); + folio_put(dst); } return rc; @@ -1244,12 +1276,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_supported(page_hstate(hpage))) { - list_move_tail(&hpage->lru, ret); + if (!hugepage_migration_supported(page_hstate(hpage))) return -ENOSYS; - } - if (page_count(hpage) == 1) { + if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ putback_active_hugepage(hpage); return MIGRATEPAGE_SUCCESS; @@ -1260,7 +1290,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; dst = page_folio(new_hpage); - if (!trylock_page(hpage)) { + if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { @@ -1270,29 +1300,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, default: goto out; } - lock_page(hpage); + folio_lock(src); } /* * Check for pages which are in the process of being freed. Without - * page_mapping() set, hugetlbfs specific move page routine will not + * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) { + if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } - if (PageAnon(hpage)) - anon_vma = page_get_anon_vma(hpage); + if (folio_test_anon(src)) + anon_vma = folio_get_anon_vma(src); - if (unlikely(!trylock_page(new_hpage))) + if (unlikely(!folio_trylock(dst))) goto put_anon; - if (page_mapped(hpage)) { + if (folio_mapped(src)) { enum ttu_flags ttu = 0; - if (!PageAnon(hpage)) { + if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. Because of this, take @@ -1313,7 +1343,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, i_mmap_unlock_write(mapping); } - if (!page_mapped(hpage)) + if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) @@ -1321,24 +1351,24 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc == MIGRATEPAGE_SUCCESS ? dst : src, false); unlock_put_anon: - unlock_page(new_hpage); + folio_unlock(dst); put_anon: if (anon_vma) put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - move_hugetlb_state(hpage, new_hpage, reason); + move_hugetlb_state(src, dst, reason); put_new_page = NULL; } out_unlock: - unlock_page(hpage); + folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); else if (rc != -EAGAIN) - list_move_tail(&hpage->lru, ret); + list_move_tail(&src->lru, ret); /* * If migration was not successful and there's a freeing callback, use @@ -1353,210 +1383,248 @@ out: return rc; } -static inline int try_split_thp(struct page *page, struct page **page2, - struct list_head *from) +static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) { - int rc = 0; + int rc; - lock_page(page); - rc = split_huge_page_to_list(page, from); - unlock_page(page); + folio_lock(folio); + rc = split_folio_to_list(folio, split_folios); + folio_unlock(folio); if (!rc) - list_safe_reset_next(page, *page2, lru); + list_move_tail(&folio->lru, split_folios); return rc; } /* - * migrate_pages - migrate the pages specified in a list, to the free pages + * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration * - * @from: The list of pages to be migrated. - * @get_new_page: The function used to allocate free pages to be used - * as the target of the page migration. - * @put_new_page: The function used to free target pages if migration + * @from: The list of folios to be migrated. + * @get_new_page: The function used to allocate free folios to be used + * as the target of the folio migration. + * @put_new_page: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for - * page migration, if any. - * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of normal pages migrated successfully if + * folio migration, if any. + * @reason: The reason for folio migration. + * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * - * The function returns after 10 attempts or if no pages are movable any more - * because the list has become empty or no retryable pages exist any more. - * It is caller's responsibility to call putback_movable_pages() to return pages + * The function returns after 10 attempts or if no folios are movable any more + * because the list has become empty or no retryable folios exist any more. + * It is caller's responsibility to call putback_movable_pages() to return folios * to the LRU or free list only if ret != 0. * - * Returns the number of {normal page, THP, hugetlb} that were not migrated, or - * an error code. The number of THP splits will be considered as the number of - * non-migrated THP, no matter how many subpages of the THP are migrated successfully. + * Returns the number of {normal folio, large folio, hugetlb} that were not + * migrated, or an error code. The number of large folio splits will be + * considered as the number of non-migrated large folio, no matter how many + * split folios of the large folio are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; + int large_retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_failed_pages = 0; + int nr_retry_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; + int nr_large_failed = 0; int nr_thp_failed = 0; int nr_thp_split = 0; int pass = 0; + bool is_large = false; bool is_thp = false; - struct page *page; - struct page *page2; - int rc, nr_subpages; - LIST_HEAD(ret_pages); - LIST_HEAD(thp_split_pages); + struct folio *folio, *folio2; + int rc, nr_pages; + LIST_HEAD(ret_folios); + LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); - bool no_subpage_counting = false; + bool no_split_folio_counting = false; trace_mm_migrate_pages_start(mode, reason); -thp_subpage_migration: - for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { +split_folio_migration: + for (pass = 0; pass < 10 && (retry || large_retry); pass++) { retry = 0; + large_retry = 0; thp_retry = 0; + nr_retry_pages = 0; - list_for_each_entry_safe(page, page2, from, lru) { -retry: + list_for_each_entry_safe(folio, folio2, from, lru) { /* - * THP statistics is based on the source huge page. - * Capture required information that might get lost - * during migration. + * Large folio statistics is based on the source large + * folio. Capture required information that might get + * lost during migration. */ - is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = compound_nr(page); + is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); cond_resched(); - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, page, - pass > 2, mode, reason, - &ret_pages); + put_new_page, private, + &folio->page, pass > 2, mode, + reason, + &ret_folios); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode, - reason, &ret_pages); + private, folio, pass > 2, mode, + reason, &ret_folios); /* * The rules are: - * Success: non hugetlb page will be freed, hugetlb - * page will be put back + * Success: non hugetlb folio will be freed, hugetlb + * folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list - * Other errno: put on ret_pages list then splice to + * -ENOSYS: stay on the from list + * Other errno: put on ret_folios list then splice to * from list */ switch(rc) { /* - * THP migration might be unsupported or the - * allocation could've failed so we should - * retry on the same page with the THP split - * to base pages. + * Large folio migration might be unsupported or + * the allocation could've failed so we should retry + * on the same folio with the large folio split + * to normal folios. * - * Head page is retried immediately and tail - * pages are added to the tail of the list so - * we encounter them after the rest of the list - * is processed. + * Split folios are put in split_folios, and + * we will migrate them after the rest of the + * list is processed. */ case -ENOSYS: - /* THP migration is unsupported */ - if (is_thp) { - nr_thp_failed++; - if (!try_split_thp(page, &page2, &thp_split_pages)) { - nr_thp_split++; - goto retry; + /* Large folio migration is unsupported */ + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + if (!try_split_folio(folio, &split_folios)) { + nr_thp_split += is_thp; + break; } /* Hugetlb migration is unsupported */ - } else if (!no_subpage_counting) { + } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_subpages; + nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, &ret_folios); break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate - * other pages, just exit. - * THP NUMA faulting doesn't split THP to retry. + * other folios, just exit. */ - if (is_thp && !nosplit) { - nr_thp_failed++; - if (!try_split_thp(page, &page2, &thp_split_pages)) { - nr_thp_split++; - goto retry; + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ + if (!nosplit) { + int ret = try_split_folio(folio, &split_folios); + + if (!ret) { + nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* + * Try again to split large folio to + * mitigate the failure of longterm pinning. + */ + large_retry++; + thp_retry += is_thp; + nr_retry_pages += nr_pages; + break; + } } - } else if (!no_subpage_counting) { + } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_subpages; + nr_failed_pages += nr_pages + nr_retry_pages; /* - * There might be some subpages of fail-to-migrate THPs - * left in thp_split_pages list. Move them back to migration + * There might be some split folios of fail-to-migrate large + * folios left in split_folios list. Move them back to migration * list so that they could be put back to the right list by - * the caller otherwise the page refcnt will be leaked. + * the caller otherwise the folio refcnt will be leaked. */ - list_splice_init(&thp_split_pages, from); + list_splice_init(&split_folios, from); + /* nr_failed isn't updated for not used */ + nr_large_failed += large_retry; nr_thp_failed += thp_retry; goto out; case -EAGAIN: - if (is_thp) - thp_retry++; - else + if (is_large) { + large_retry++; + thp_retry += is_thp; + } else if (!no_split_folio_counting) { retry++; + } + nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded += nr_subpages; - if (is_thp) - nr_thp_succeeded++; + nr_succeeded += nr_pages; + nr_thp_succeeded += is_thp; break; default: /* * Permanent failure (-EBUSY, etc.): - * unlike -EAGAIN case, the failed page is - * removed from migration page list and not + * unlike -EAGAIN case, the failed folio is + * removed from migration folio list and not * retried in the next outer loop. */ - if (is_thp) - nr_thp_failed++; - else if (!no_subpage_counting) + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { nr_failed++; + } - nr_failed_pages += nr_subpages; + nr_failed_pages += nr_pages; break; } } } nr_failed += retry; + nr_large_failed += large_retry; nr_thp_failed += thp_retry; + nr_failed_pages += nr_retry_pages; /* - * Try to migrate subpages of fail-to-migrate THPs, no nr_failed - * counting in this round, since all subpages of a THP is counted - * as 1 failure in the first round. + * Try to migrate split folios of fail-to-migrate large folios, no + * nr_failed counting in this round, since all split folios of a + * large folio is counted as 1 failure in the first round. */ - if (!list_empty(&thp_split_pages)) { + if (!list_empty(&split_folios)) { /* - * Move non-migrated pages (after 10 retries) to ret_pages + * Move non-migrated folios (after 10 retries) to ret_folios * to avoid migrating them again. */ - list_splice_init(from, &ret_pages); - list_splice_init(&thp_split_pages, from); - no_subpage_counting = true; + list_splice_init(from, &ret_folios); + list_splice_init(&split_folios, from); + no_split_folio_counting = true; retry = 1; - goto thp_subpage_migration; + goto split_folio_migration; } - rc = nr_failed + nr_thp_failed; + rc = nr_failed + nr_large_failed; out: /* - * Put the permanent failure page back to migration list, they + * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ - list_splice(&ret_pages, from); + list_splice(&ret_folios, from); + + /* + * Return 0 in case all split folios of fail-to-migrate large folios + * are migrated successfully. + */ + if (list_empty(from)) + rc = 0; count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); @@ -1589,7 +1657,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) nid = folio_nid(folio); if (folio_test_hugetlb(folio)) { - struct hstate *h = page_hstate(&folio->page); + struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); @@ -1672,9 +1740,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; err = -ENOENT; - if (!page || is_zone_device_page(page)) + if (!page) goto out; + if (is_zone_device_page(page)) + goto out_putpage; + err = 0; if (page_to_nid(page) == node) goto out_putpage; @@ -1735,7 +1806,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node, * well. */ if (err > 0) - err += nr_pages - i - 1; + err += nr_pages - i; return err; } return store_status(status, start, node, i - start); @@ -1821,8 +1892,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, err = move_pages_and_store_status(mm, current_node, &pagelist, status, start, i, nr_pages); - if (err) + if (err) { + /* We have accounted for page i */ + if (err > 0) + err--; goto out; + } current_node = NUMA_NO_NODE; } out_flush: @@ -1863,12 +1938,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (IS_ERR(page)) goto set_status; - if (page && !is_zone_device_page(page)) { + err = -ENOENT; + if (!page) + goto set_status; + + if (!is_zone_device_page(page)) err = page_to_nid(page); - put_page(page); - } else { - err = -ENOENT; - } + + put_page(page); set_status: *status = err; @@ -2170,456 +2247,4 @@ out: return 0; } #endif /* CONFIG_NUMA_BALANCING */ - -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 - * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 - * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate - * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 - * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 - * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate - * - * Moreover some systems may have multiple slow memory nodes. - * Suppose a system has one socket with 3 memory nodes, node 0 - * is fast memory type, and node 1/2 both are slow memory - * type, and the distance between fast memory node and slow - * memory node is same. So the migration path should be: - * - * 0 -> 1/2 -> stop - * - * This is represented in the node_demotion[] like this: - * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 - * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate - * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate - */ - -/* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -#define DEFAULT_DEMOTION_TARGET_NODES 15 - -#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES -#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) -#else -#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES -#endif - -struct demotion_nodes { - unsigned short nr; - short nodes[DEMOTION_TARGET_NODES]; -}; - -static struct demotion_nodes *node_demotion __read_mostly; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ -int next_demotion_node(int node) -{ - struct demotion_nodes *nd; - unsigned short target_nr, index; - int target; - - if (!node_demotion) - return NUMA_NO_NODE; - - nd = &node_demotion[node]; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target_nr = READ_ONCE(nd->nr); - - switch (target_nr) { - case 0: - target = NUMA_NO_NODE; - goto out; - case 1: - index = 0; - break; - default: - /* - * If there are multiple target nodes, just select one - * target node randomly. - * - * In addition, we can also use round-robin to select - * target node, but we should introduce another variable - * for node_demotion[] to record last selected target node, - * that may cause cache ping-pong due to the changing of - * last target node. Or introducing per-cpu data to avoid - * caching issue, which seems more complicated. So selecting - * target node randomly seems better until now. - */ - index = get_random_int() % target_nr; - break; - } - - target = READ_ONCE(nd->nodes[index]); - -out: - rcu_read_unlock(); - return target; -} - -/* Disable reclaim-based migration. */ -static void __disable_all_migrate_targets(void) -{ - int node, i; - - if (!node_demotion) - return; - - for_each_online_node(node) { - node_demotion[node].nr = 0; - for (i = 0; i < DEMOTION_TARGET_NODES; i++) - node_demotion[node].nodes[i] = NUMA_NO_NODE; - } -} - -static void disable_all_migrate_targets(void) -{ - __disable_all_migrate_targets(); - - /* - * Ensure that the "disable" is visible across the system. - * Readers will see either a combination of before+disable - * state or disable+after. They will never see before and - * after state together. - * - * The before+after state together might have cycles and - * could cause readers to do things like loop until this - * function finishes. This ensures they can only see a - * single "bad" read and would, for instance, only loop - * once. - */ - synchronize_rcu(); -} - -/* - * Find an automatic demotion target for 'node'. - * Failing here is OK. It might just indicate - * being at the end of a chain. - */ -static int establish_migrate_target(int node, nodemask_t *used, - int best_distance) -{ - int migration_target, index, val; - struct demotion_nodes *nd; - - if (!node_demotion) - return NUMA_NO_NODE; - - nd = &node_demotion[node]; - - migration_target = find_next_best_node(node, used); - if (migration_target == NUMA_NO_NODE) - return NUMA_NO_NODE; - - /* - * If the node has been set a migration target node before, - * which means it's the best distance between them. Still - * check if this node can be demoted to other target nodes - * if they have a same best distance. - */ - if (best_distance != -1) { - val = node_distance(node, migration_target); - if (val > best_distance) - goto out_clear; - } - - index = nd->nr; - if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, - "Exceeds maximum demotion target nodes\n")) - goto out_clear; - - nd->nodes[index] = migration_target; - nd->nr++; - - return migration_target; -out_clear: - node_clear(migration_target, *used); - return NUMA_NO_NODE; -} - -/* - * When memory fills up on a node, memory contents can be - * automatically migrated to another node instead of - * discarded at reclaim. - * - * Establish a "migration path" which will start at nodes - * with CPUs and will follow the priorities used to build the - * page allocator zonelists. - * - * The difference here is that cycles must be avoided. If - * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. Also one node can - * be migrated to multiple nodes if the target nodes all have - * a same best-distance against the source node. - * - * This function can run simultaneously with readers of - * node_demotion[]. However, it can not run simultaneously - * with itself. Exclusion is provided by memory hotplug events - * being single-threaded. - */ -static void __set_migration_target_nodes(void) -{ - nodemask_t next_pass; - nodemask_t this_pass; - nodemask_t used_targets = NODE_MASK_NONE; - int node, best_distance; - - /* - * Avoid any oddities like cycles that could occur - * from changes in the topology. This will leave - * a momentary gap when migration is disabled. - */ - disable_all_migrate_targets(); - - /* - * Allocations go close to CPUs, first. Assume that - * the migration path starts at the nodes with CPUs. - */ - next_pass = node_states[N_CPU]; -again: - this_pass = next_pass; - next_pass = NODE_MASK_NONE; - /* - * To avoid cycles in the migration "graph", ensure - * that migration sources are not future targets by - * setting them in 'used_targets'. Do this only - * once per pass so that multiple source nodes can - * share a target node. - * - * 'used_targets' will become unavailable in future - * passes. This limits some opportunities for - * multiple source nodes to share a destination. - */ - nodes_or(used_targets, used_targets, this_pass); - - for_each_node_mask(node, this_pass) { - best_distance = -1; - - /* - * Try to set up the migration path for the node, and the target - * migration nodes can be multiple, so doing a loop to find all - * the target nodes if they all have a best node distance. - */ - do { - int target_node = - establish_migrate_target(node, &used_targets, - best_distance); - - if (target_node == NUMA_NO_NODE) - break; - - if (best_distance == -1) - best_distance = node_distance(node, target_node); - - /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. - */ - node_set(target_node, next_pass); - } while (1); - } - /* - * 'next_pass' contains nodes which became migration - * targets in this pass. Make additional passes until - * no more migrations targets are available. - */ - if (!nodes_empty(next_pass)) - goto again; -} - -/* - * For callers that do not hold get_online_mems() already. - */ -void set_migration_target_nodes(void) -{ - get_online_mems(); - __set_migration_target_nodes(); - put_online_mems(); -} - -/* - * This leaves migrate-on-reclaim transiently disabled between - * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs - * whether reclaim-based migration is enabled or not, which - * ensures that the user can turn reclaim-based migration at - * any time without needing to recalculate migration targets. - * - * These callbacks already hold get_online_mems(). That is why - * __set_migration_target_nodes() can be used as opposed to - * set_migration_target_nodes(). - */ -#ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, - unsigned long action, void *_arg) -{ - struct memory_notify *arg = _arg; - - /* - * Only update the node migration order when a node is - * changing status, like online->offline. This avoids - * the overhead of synchronize_rcu() in most cases. - */ - if (arg->status_change_nid < 0) - return notifier_from_errno(0); - - switch (action) { - case MEM_GOING_OFFLINE: - /* - * Make sure there are not transient states where - * an offline node is a migration target. This - * will leave migration disabled until the offline - * completes and the MEM_OFFLINE case below runs. - */ - disable_all_migrate_targets(); - break; - case MEM_OFFLINE: - case MEM_ONLINE: - /* - * Recalculate the target nodes once the node - * reaches its final state (online or offline). - */ - __set_migration_target_nodes(); - break; - case MEM_CANCEL_OFFLINE: - /* - * MEM_GOING_OFFLINE disabled all the migration - * targets. Reenable them. - */ - __set_migration_target_nodes(); - break; - case MEM_GOING_ONLINE: - case MEM_CANCEL_ONLINE: - break; - } - - return notifier_from_errno(0); -} -#endif - -void __init migrate_on_reclaim_init(void) -{ - node_demotion = kcalloc(nr_node_ids, - sizeof(struct demotion_nodes), - GFP_KERNEL); - WARN_ON(!node_demotion); -#ifdef CONFIG_MEMORY_HOTPLUG - hotplug_memory_notifier(migrate_on_reclaim_callback, 100); -#endif - /* - * At this point, all numa nodes with memory/CPus have their state - * properly set, so we can build the demotion order now. - * Let us hold the cpu_hotplug lock just, as we could possibily have - * CPU hotplug events during boot. - */ - cpus_read_lock(); - set_migration_target_nodes(); - cpus_read_unlock(); -} - -bool numa_demotion_enabled = false; - -#ifdef CONFIG_SYSFS -static ssize_t numa_demotion_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%s\n", - numa_demotion_enabled ? "true" : "false"); -} - -static ssize_t numa_demotion_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - ssize_t ret; - - ret = kstrtobool(buf, &numa_demotion_enabled); - if (ret) - return ret; - - return count; -} - -static struct kobj_attribute numa_demotion_enabled_attr = - __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, - numa_demotion_enabled_store); - -static struct attribute *numa_attrs[] = { - &numa_demotion_enabled_attr.attr, - NULL, -}; - -static const struct attribute_group numa_attr_group = { - .attrs = numa_attrs, -}; - -static int __init numa_init_sysfs(void) -{ - int err; - struct kobject *numa_kobj; - - numa_kobj = kobject_create_and_add("numa", mm_kobj); - if (!numa_kobj) { - pr_err("failed to create numa kobject\n"); - return -ENOMEM; - } - err = sysfs_create_group(numa_kobj, &numa_attr_group); - if (err) { - pr_err("failed to register numa group\n"); - goto delete_obj; - } - return 0; - -delete_obj: - kobject_put(numa_kobj); - return err; -} -subsys_initcall(numa_init_sysfs); -#endif /* CONFIG_SYSFS */ #endif /* CONFIG_NUMA */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 27fb37d65476..721b2365dbca 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/memremap.h> #include <linux/migrate.h> +#include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/oom.h> @@ -185,18 +186,25 @@ again: get_page(page); /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. + * We rely on trylock_page() to avoid deadlock between + * concurrent migrations where each is waiting on the others + * page lock. If we can't immediately lock the page we fail this + * migration as it is only best effort anyway. + * + * If we can lock the page it's safe to set up a migration entry + * now. In the common case where the page is mapped once in a + * single process setting up the migration entry now is an + * optimisation to avoid walking the rmap later with + * try_to_migrate(). */ if (trylock_page(page)) { bool anon_exclusive; pte_t swp_pte; + flush_cache_page(vma, addr, pte_pfn(*ptep)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + pte = ptep_clear_flush(vma, addr, ptep); if (page_try_share_anon_rmap(page)) { set_pte_at(mm, addr, ptep, pte); @@ -206,11 +214,15 @@ again: goto next; } } else { - ptep_get_and_clear(mm, addr, ptep); + pte = ptep_get_and_clear(mm, addr, ptep); } migrate->cpages++; + /* Set the dirty flag on the folio now the pte is gone. */ + if (pte_dirty(pte)) + folio_mark_dirty(page_folio(page)); + /* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) entry = make_writable_migration_entry( @@ -221,6 +233,12 @@ again: else entry = make_readable_migration_entry( page_to_pfn(page)); + if (pte_present(pte)) { + if (pte_young(pte)) + entry = make_migration_entry_young(entry); + if (pte_dirty(pte)) + entry = make_migration_entry_dirty(entry); + } swp_pte = swp_entry_to_pte(entry); if (pte_present(pte)) { if (pte_soft_dirty(pte)) @@ -254,13 +272,14 @@ next: migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = mpfn; } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); /* Only flush the TLB if we actually modified any entries */ if (unmapped) flush_tlb_range(walk->vma, start, end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + return 0; } @@ -306,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. */ -static bool migrate_vma_check_page(struct page *page) +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for * a device page. */ - int extra = 1; + int extra = 1 + (page == fault_page); /* * FIXME support THP (transparent huge page), it is bit more complex to @@ -338,30 +357,28 @@ static bool migrate_vma_check_page(struct page *page) } /* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. + * Unmaps pages for migration. Returns number of source pfns marked as + * migrating. */ -static void migrate_vma_unmap(struct migrate_vma *migrate) +static unsigned long migrate_device_unmap(unsigned long *src_pfns, + unsigned long npages, + struct page *fault_page) { - const unsigned long npages = migrate->npages; unsigned long i, restore = 0; bool allow_drain = true; + unsigned long unmapped = 0; lru_add_drain(); for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; - if (!page) + if (!page) { + if (src_pfns[i] & MIGRATE_PFN_MIGRATE) + unmapped++; continue; + } /* ZONE_DEVICE pages are not on LRU */ if (!is_zone_device_page(page)) { @@ -372,8 +389,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } @@ -386,34 +402,55 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (page_mapped(page) || + !migrate_vma_check_page(page, fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); } - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } + + unmapped++; } for (i = 0; i < npages && restore; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; folio = page_folio(page); remove_migration_ptes(folio, folio, false); - migrate->src[i] = 0; + src_pfns[i] = 0; folio_unlock(folio); folio_put(folio); restore--; } + + return unmapped; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages, + migrate->fault_page); } /** @@ -498,6 +535,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (!args->src || !args->dst) return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -658,42 +697,38 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -/** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information - * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. - */ -void migrate_vma_pages(struct migrate_vma *migrate) +static void __migrate_device_pages(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages, + struct migrate_vma *migrate) { - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; struct mmu_notifier_range range; - unsigned long addr, i; + unsigned long i; bool notified = false; - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; int r; if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } if (!page) { + unsigned long addr; + + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't * called if the page could not be unmapped. */ - VM_BUG_ON(!migrate->vma); - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) - continue; + VM_BUG_ON(!migrate); + addr = migrate->start + i*PAGE_SIZE; if (!notified) { notified = true; @@ -704,7 +739,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); + &src_pfns[i]); continue; } @@ -717,21 +752,26 @@ void migrate_vma_pages(struct migrate_vma *migrate) * device private or coherent memory. */ if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } } else if (is_zone_device_page(newpage)) { /* * Other types of ZONE_DEVICE page are not supported. */ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + if (migrate && migrate->fault_page == page) + r = migrate_folio_extra(mapping, page_folio(newpage), + page_folio(page), + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } /* @@ -742,28 +782,56 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (notified) mmu_notifier_invalidate_range_only_end(&range); } -EXPORT_SYMBOL(migrate_vma_pages); /** - * migrate_vma_finalize() - restore CPU page table entry + * migrate_device_pages() - migrate meta-data from src page to dst page + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Equivalent to migrate_vma_pages(). This is called to migrate struct page + * meta-data from source struct page to destination. + */ +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages) +{ + __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); +} +EXPORT_SYMBOL(migrate_device_pages); + +/** + * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +void migrate_vma_pages(struct migrate_vma *migrate) +{ + __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); +} +EXPORT_SYMBOL(migrate_vma_pages); + +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. */ -void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { - const unsigned long npages = migrate->npages; unsigned long i; for (i = 0; i < npages; i++) { struct folio *dst, *src; - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); if (!page) { if (newpage) { @@ -773,7 +841,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) continue; } - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { if (newpage) { unlock_page(newpage); put_page(newpage); @@ -800,8 +868,72 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } } +EXPORT_SYMBOL(migrate_device_finalize); + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); +} EXPORT_SYMBOL(migrate_vma_finalize); +/** + * migrate_device_range() - migrate device private pfns to normal memory. + * @src_pfns: array large enough to hold migrating source device private pfns. + * @start: starting pfn in the range to migrate. + * @npages: number of pages to migrate. + * + * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that + * instead of looking up pages based on virtual address mappings a range of + * device pfns that should be migrated to system memory is used instead. + * + * This is useful when a driver needs to free device memory but doesn't know the + * virtual mappings of every page that may be in device memory. For example this + * is often the case when a driver is being unloaded or unbound from a device. + * + * Like migrate_vma_setup() this function will take a reference and lock any + * migrating pages that aren't free before unmapping them. Drivers may then + * allocate destination pages and start copying data from the device to CPU + * memory before calling migrate_device_pages(). + */ +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages) +{ + unsigned long i, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + if (!get_page_unless_zero(page)) { + src_pfns[i] = 0; + continue; + } + + if (!trylock_page(page)) { + src_pfns[i] = 0; + put_page(page); + continue; + } + + src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + } + + migrate_device_unmap(src_pfns, npages, NULL); + + return 0; +} +EXPORT_SYMBOL(migrate_device_range); + /* * Migrate a device coherent page back to normal memory. The caller should have * a reference on page which will be copied to the new page if migration is @@ -810,25 +942,19 @@ EXPORT_SYMBOL(migrate_vma_finalize); int migrate_device_coherent_page(struct page *page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma args; struct page *dpage; WARN_ON_ONCE(PageCompound(page)); lock_page(page); src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - args.src = &src_pfn; - args.dst = &dst_pfn; - args.cpages = 1; - args.npages = 1; - args.vma = NULL; /* * We don't have a VMA and don't need to walk the page tables to find * the source page. So call migrate_vma_unmap() directly to unmap the * page as migrate_vma_setup() will fail if args.vma == NULL. */ - migrate_vma_unmap(&args); + migrate_device_unmap(&src_pfn, 1, NULL); if (!(src_pfn & MIGRATE_PFN_MIGRATE)) return -EBUSY; @@ -838,10 +964,10 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_vma_pages(&args); + migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); - migrate_vma_finalize(&args); + migrate_device_finalize(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) return 0; diff --git a/mm/mincore.c b/mm/mincore.c index fa200c14185f..a085a2aeabd8 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -52,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; - struct page *page; + struct folio *folio; /* * When tmpfs swaps out a page from a file, any process mapping that @@ -60,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_incore_page(mapping, index); - if (page) { - present = PageUptodate(page); - put_page(page); + folio = filemap_get_incore_folio(mapping, index); + if (folio) { + present = folio_test_uptodate(folio); + folio_put(folio); } return present; @@ -190,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v unsigned long end; int err; - vma = find_vma(current->mm, addr); - if (!vma || addr < vma->vm_start) + vma = vma_lookup(current->mm, addr); + if (!vma) return -ENOMEM; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); if (!can_do_mincore(vma)) { diff --git a/mm/mlock.c b/mm/mlock.c index b14e929084cc..7032f6dd0ce1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; + MA_STATE(mas, ¤t->mm->mm_mt, start, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = find_vma(current->mm, start); - if (!vma || vma->vm_start > start) + vma = mas_walk(&mas); + if (!vma) return -ENOMEM; - prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; + else + prev = mas_prev(&mas, 0); for (nstart = start ; ; ) { vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(prev->vm_mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; @@ -526,24 +528,21 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long count = 0; + unsigned long end; + VMA_ITERATOR(vmi, mm, start); - if (mm == NULL) - mm = current->mm; + /* Don't overflow past ULONG_MAX */ + if (unlikely(ULONG_MAX - len < start)) + end = ULONG_MAX; + else + end = start + len; - vma = find_vma(mm, start); - if (vma == NULL) - return 0; - - for (; vma ; vma = vma->vm_next) { - if (start >= vma->vm_end) - continue; - if (start + len <= vma->vm_start) - break; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); - if (start + len < vma->vm_end) { - count += start + len - vma->vm_start; + if (end < vma->vm_end) { + count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; @@ -659,6 +658,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,7 +679,7 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -687,6 +687,7 @@ static int apply_mlockall_flags(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mas_pause(&mas); cond_resched(); } out: diff --git a/mm/mm_init.c b/mm/mm_init.c index 9ddaf0e1b0ab..c1883362e71d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void) shift = 8 * sizeof(unsigned long); width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, KASAN_TAG_WIDTH, + LRU_GEN_WIDTH, + LRU_REFS_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", @@ -176,16 +178,10 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block compute_batch_nb __meminitdata = { - .notifier_call = mm_compute_batch_notifier, - .priority = IPC_CALLBACK_PRI, /* use lowest priority */ -}; - static int __init mm_compute_batch_init(void) { mm_compute_batch(sysctl_overcommit_memory); - register_hotmemory_notifier(&compute_batch_nb); - + hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI); return 0; } diff --git a/mm/mm_slot.h b/mm/mm_slot.h new file mode 100644 index 000000000000..83f18ed1c4bd --- /dev/null +++ b/mm/mm_slot.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _LINUX_MM_SLOT_H +#define _LINUX_MM_SLOT_H + +#include <linux/hashtable.h> +#include <linux/slab.h> + +/* + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: link to the mm_slots hash list + * @mm_node: link into the mm_slots list + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +#define mm_slot_entry(ptr, type, member) \ + container_of(ptr, type, member) + +static inline void *mm_slot_alloc(struct kmem_cache *cache) +{ + if (!cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(cache, GFP_KERNEL); +} + +static inline void mm_slot_free(struct kmem_cache *cache, void *objp) +{ + kmem_cache_free(cache, objp); +} + +#define mm_slot_lookup(_hashtable, _mm) \ +({ \ + struct mm_slot *tmp_slot, *mm_slot = NULL; \ + \ + hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \ + if (_mm == tmp_slot->mm) { \ + mm_slot = tmp_slot; \ + break; \ + } \ + \ + mm_slot; \ +}) + +#define mm_slot_insert(_hashtable, _mm, _mm_slot) \ +({ \ + _mm_slot->mm = _mm; \ + hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \ +}) + +#endif /* _LINUX_MM_SLOT_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 9d780f415be3..87d929316d57 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -14,7 +14,6 @@ #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> -#include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -39,7 +38,6 @@ #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> -#include <linux/rbtree_augmented.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> @@ -77,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); + struct vm_area_struct *next, unsigned long start, + unsigned long end); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -132,12 +131,10 @@ void unlink_file_vma(struct vm_area_struct *vma) } /* - * Close a vm structure and free it, returning the next. + * Close a vm structure and free it. */ -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +static void remove_vma(struct vm_area_struct *vma) { - struct vm_area_struct *next = vma->vm_next; - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); @@ -145,20 +142,41 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); - return next; } -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, - struct list_head *uf); +/* + * check_brk_limits() - Use platform specific check of range & verify mlock + * limits. + * @addr: The address to check + * @len: The size of increase. + * + * Return: 0 on success. + */ +static int check_brk_limits(unsigned long addr, unsigned long len) +{ + unsigned long mapped_addr; + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; + + return mlock_future_check(current->mm, current->mm->def_flags, len); +} +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; - struct vm_area_struct *next; + struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate; bool downgraded = false; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -200,35 +218,50 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_lock to read. + * do_brk_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; + /* Search one past newbrk */ + mas_set(&mas, newbrk); + brkvma = mas_find(&mas, oldbrk); + if (!brkvma || brkvma->vm_start >= oldbrk) + goto out; /* mapping intersects with an existing non-brk vma. */ /* - * mm->brk must to be protected by write mmap_lock so update it - * before downgrading mmap_lock. When __do_munmap() fails, - * mm->brk will be restored from origbrk. + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). */ mm->brk = brk; - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); - if (ret < 0) { - mm->brk = origbrk; - goto out; - } else if (ret == 1) { + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { downgraded = true; - } - goto success; + goto success; + } else if (!ret) + goto success; + + mm->brk = origbrk; + goto out; } - /* Check against existing mmap mappings. */ - next = find_vma(mm, oldbrk); + if (check_brk_limits(oldbrk, newbrk - oldbrk)) + goto out; + + /* + * Only check if the next VMA is within the stack_guard_gap of the + * expansion area + */ + mas_set(&mas, oldbrk); + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; + brkvma = mas_prev(&mas, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; + mm->brk = brk; success: @@ -247,104 +280,45 @@ out: return origbrk; } -static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) -{ - unsigned long gap, prev_end; - - /* - * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we - * allow two stack_guard_gaps between them here, and when choosing - * an unmapped area; whereas when expanding we only require one. - * That's a little inconsistent, but keeps the code here simpler. - */ - gap = vm_start_gap(vma); - if (vma->vm_prev) { - prev_end = vm_end_gap(vma->vm_prev); - if (gap > prev_end) - gap -= prev_end; - else - gap = 0; - } - return gap; -} - -#ifdef CONFIG_DEBUG_VM_RB -static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) -{ - unsigned long max = vma_compute_gap(vma), subtree_gap; - if (vma->vm_rb.rb_left) { - subtree_gap = rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - if (vma->vm_rb.rb_right) { - subtree_gap = rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - return max; -} - -static int browse_rb(struct mm_struct *mm) -{ - struct rb_root *root = &mm->mm_rb; - int i = 0, j, bug = 0; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) { - pr_emerg("vm_start %lx < prev %lx\n", - vma->vm_start, prev); - bug = 1; - } - if (vma->vm_start < pend) { - pr_emerg("vm_start %lx < pend %lx\n", - vma->vm_start, pend); - bug = 1; - } - if (vma->vm_start > vma->vm_end) { - pr_emerg("vm_start %lx > vm_end %lx\n", - vma->vm_start, vma->vm_end); - bug = 1; - } - spin_lock(&mm->page_table_lock); - if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_emerg("free gap %lx, correct %lx\n", - vma->rb_subtree_gap, - vma_compute_subtree_gap(vma)); - bug = 1; +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) +extern void mt_validate(struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt); + +/* Validate the maple tree */ +static void validate_mm_mt(struct mm_struct *mm) +{ + struct maple_tree *mt = &mm->mm_mt; + struct vm_area_struct *vma_mt; + + MA_STATE(mas, mt, 0, 0); + + mt_validate(&mm->mm_mt); + mas_for_each(&mas, vma_mt, ULONG_MAX) { + if ((vma_mt->vm_start != mas.index) || + (vma_mt->vm_end - 1 != mas.last)) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma_mt); + pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, + mas.index, mas.last); + pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, + vma_mt->vm_start, vma_mt->vm_end); + + mt_dump(mas.tree); + if (vma_mt->vm_end != mas.last + 1) { + pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", + mm, vma_mt->vm_start, vma_mt->vm_end, + mas.index, mas.last); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); + if (vma_mt->vm_start != mas.index) { + pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", + mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); } - spin_unlock(&mm->page_table_lock); - i++; - pn = nd; - prev = vma->vm_start; - pend = vma->vm_end; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) - j++; - if (i != j) { - pr_emerg("backwards %d, forwards %d\n", j, i); - bug = 1; - } - return bug ? -1 : i; -} - -static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) -{ - struct rb_node *nd; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - VM_BUG_ON_VMA(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma), - vma); } } @@ -352,10 +326,13 @@ static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - unsigned long highest_address = 0; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); - while (vma) { + validate_mm_mt(mm); + + mas_for_each(&mas, vma, ULONG_MAX) { +#ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; @@ -365,93 +342,20 @@ static void validate_mm(struct mm_struct *mm) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } - - highest_address = vm_end_gap(vma); - vma = vma->vm_next; +#endif i++; } if (i != mm->map_count) { - pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); - bug = 1; - } - if (highest_address != mm->highest_vm_end) { - pr_emerg("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); - bug = 1; - } - i = browse_rb(mm); - if (i != mm->map_count) { - if (i != -1) - pr_emerg("map_count %d rb %d\n", mm->map_count, i); + pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } -#else -#define validate_mm_rb(root, ignore) do { } while (0) -#define validate_mm(mm) do { } while (0) -#endif - -RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, - struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_gap) - -/* - * Update augmented rbtree rb_subtree_gap values after vma->vm_start or - * vma->vm_prev->vm_end values changed, without modifying the vma's position - * in the rbtree. - */ -static void vma_gap_update(struct vm_area_struct *vma) -{ - /* - * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created - * a callback function that does exactly what we want. - */ - vma_gap_callbacks_propagate(&vma->vm_rb, NULL); -} - -static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) -{ - /* All rb_subtree_gap values must be consistent prior to insertion */ - validate_mm_rb(root, NULL); - - rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) -{ - /* - * Note rb_erase_augmented is a fairly large inline function, - * so make sure we instantiate it only once with our desired - * augmented rbtree callbacks. - */ - rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, - struct rb_root *root, - struct vm_area_struct *ignore) -{ - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of - * - * a. the "next" vma being erased if next->vm_start was reduced in - * __vma_adjust() -> __vma_unlink() - * b. the vma being erased in detach_vmas_to_be_unmapped() -> - * vma_rb_erase() - */ - validate_mm_rb(root, ignore); - __vma_rb_erase(vma, root); -} - -static __always_inline void vma_rb_erase(struct vm_area_struct *vma, - struct rb_root *root) -{ - vma_rb_erase_ignore(vma, root, vma); -} +#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ +#define validate_mm_mt(root) do { } while (0) +#define validate_mm(mm) do { } while (0) +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that @@ -485,208 +389,220 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -static int find_vma_links(struct mm_struct *mm, unsigned long addr, - unsigned long end, struct vm_area_struct **pprev, - struct rb_node ***rb_link, struct rb_node **rb_parent) +static unsigned long count_vma_pages_range(struct mm_struct *mm, + unsigned long addr, unsigned long end) { - struct rb_node **__rb_link, *__rb_parent, *rb_prev; + VMA_ITERATOR(vmi, mm, addr); + struct vm_area_struct *vma; + unsigned long nr_pages = 0; - mmap_assert_locked(mm); - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; + for_each_vma_range(vmi, vma, end) { + unsigned long vm_start = max(addr, vma->vm_start); + unsigned long vm_end = min(end, vma->vm_end); - while (*__rb_link) { - struct vm_area_struct *vma_tmp; + nr_pages += PHYS_PFN(vm_end - vm_start); + } - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + return nr_pages; +} - if (vma_tmp->vm_end > addr) { - /* Fail if an existing vma overlaps the area */ - if (vma_tmp->vm_start < end) - return -ENOMEM; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(mapping); - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return 0; + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); } /* - * vma_next() - Get the next VMA. - * @mm: The mm_struct. - * @vma: The current vma. + * vma_mas_store() - Store a VMA in the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state * - * If @vma is NULL, return the first vma in the mm. + * Efficient way to store a VMA in the maple tree when the @mas has already + * walked to the correct location. * - * Returns: The next VMA after @vma. + * Note: the end address is inclusive in the maple tree. */ -static inline struct vm_area_struct *vma_next(struct mm_struct *mm, - struct vm_area_struct *vma) +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) { - if (!vma) - return mm->mmap; - - return vma->vm_next; + trace_vma_store(mas->tree, vma); + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); } /* - * munmap_vma_range() - munmap VMAs that overlap a range. - * @mm: The mm struct - * @start: The start of the range. - * @len: The length of the range. - * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * @rb_link: the rb_node - * @rb_parent: the parent rb_node - * - * Find all the vm_area_struct that overlap from @start to - * @end and munmap them. Set @pprev to the previous vm_area_struct. + * vma_mas_remove() - Remove a VMA from the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state * - * Returns: -ENOMEM on munmap failure or 0 on success. + * Efficient way to remove a VMA from the maple tree when the @mas has already + * been established and points to the correct location. + * Note: the end address is inclusive in the maple tree. */ -static inline int -munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct rb_node ***link, - struct rb_node **parent, struct list_head *uf) +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) { - - while (find_vma_links(mm, start, start + len, pprev, link, parent)) - if (do_munmap(mm, start, len, uf)) - return -ENOMEM; - - return 0; + trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); } -static unsigned long count_vma_pages_range(struct mm_struct *mm, - unsigned long addr, unsigned long end) + +/* + * vma_mas_szero() - Set a given range to zero. Used when modifying a + * vm_area_struct start or end. + * + * @mas: The maple tree ma_state + * @start: The start address to zero + * @end: The end address to zero. + */ +static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, + unsigned long end) { - unsigned long nr_pages = 0; - struct vm_area_struct *vma; + trace_vma_mas_szero(mas->tree, start, end - 1); + mas_set_range(mas, start, end - 1); + mas_store_prealloc(mas, NULL); +} - /* Find first overlapping mapping */ - vma = find_vma_intersection(mm, addr, end); - if (!vma) - return 0; +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct address_space *mapping = NULL; - nr_pages = (min(end, vma->vm_end) - - max(addr, vma->vm_start)) >> PAGE_SHIFT; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; - /* Iterate over the rest of the overlaps */ - for (vma = vma->vm_next; vma; vma = vma->vm_next) { - unsigned long overlap_len; + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + i_mmap_lock_write(mapping); + } - if (vma->vm_start > end) - break; + vma_mas_store(vma, &mas); - overlap_len = min(end, vma->vm_end) - vma->vm_start; - nr_pages += overlap_len >> PAGE_SHIFT; + if (mapping) { + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); } - return nr_pages; + mm->map_count++; + validate_mm(mm); + return 0; } -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) +/* + * vma_expand - Expand an existing VMA + * + * @mas: The maple state + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) { - /* Update tracking information for the gap following the new vma. */ - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = vma->anon_vma; + struct file *file = vma->vm_file; + bool remove_next = false; - /* - * vma->vm_prev wasn't known when we followed the rbtree to find the - * correct insertion point for that vma. As a result, we could not - * update the vma vm_rb parents rb_subtree_gap values on the way down. - * So, we first insert the vma with a zero rb_subtree_gap value - * (to be consistent with what we did on the way down), and then - * immediately update the gap to the correct value. Finally we - * rebalance the rbtree after all augmented values have been set. - */ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - vma->rb_subtree_gap = 0; - vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); -} + if (next && (vma != next) && (end == next->vm_end)) { + remove_next = true; + if (next->anon_vma && !vma->anon_vma) { + int error; -static void __vma_link_file(struct vm_area_struct *vma) -{ - struct file *file; + anon_vma = next->anon_vma; + vma->anon_vma = anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); - file = vma->vm_file; if (file) { - struct address_space *mapping = file->f_mapping; + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + i_mmap_lock_write(mapping); + } - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(mapping); + if (anon_vma) { + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + if (file) { flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); + vma_interval_tree_remove(vma, root); } -} -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); -} + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_mas_store(vma, mas); -static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - struct address_space *mapping = NULL; + if (file) { + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); + /* Expanding over the next vma */ + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); } - __vma_link(mm, vma, prev, rb_link, rb_parent); - __vma_link_file(vma); + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(anon_vma); + } - if (mapping) + if (file) { i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + } - mm->map_count++; - validate_mm(mm); -} - -/* - * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the interval tree. - */ -static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; + if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + vm_area_free(next); + } - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - BUG(); - __vma_link(mm, vma, prev, rb_link, rb_parent); - mm->map_count++; -} + validate_mm(mm); + return 0; -static __always_inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *ignore) -{ - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - __vma_unlink_list(mm, vma); - /* Kill the cache */ - vmacache_invalidate(mm); +nomem: + return -ENOMEM; } /* @@ -701,18 +617,20 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; + struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; - bool start_changed = false, end_changed = false; + bool vma_changed = false; long adjust_next = 0; int remove_next = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vm_area_struct *exporter = NULL, *importer = NULL; if (next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; - if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -741,10 +659,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * remove_next == 1 is case 1 or 7. */ remove_next = 1 + (end > next->vm_end); + if (remove_next == 2) + next_next = find_vma(mm, next->vm_end); + VM_WARN_ON(remove_next == 2 && - end != next->vm_next->vm_end); - /* trim end to next, for case 6 first pass */ - end = next->vm_end; + end != next_next->vm_end); } exporter = next; @@ -755,7 +674,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * next, if the vma overlaps with it. */ if (remove_next == 2 && !next->anon_vma) - exporter = next->vm_next; + exporter = next_next; } else if (end > next->vm_start) { /* @@ -792,9 +711,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, return error; } } -again: - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; root = &mapping->i_mmap; @@ -804,14 +725,14 @@ again: uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); - if (insert) { + if (insert && insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ - __vma_link_file(insert); + __vma_link_file(insert, insert->vm_file->f_mapping); } } @@ -835,17 +756,37 @@ again: } if (start != vma->vm_start) { + if ((vma->vm_start < start) && + (!insert || (insert->vm_end != start))) { + vma_mas_szero(&mas, vma->vm_start, start); + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } else { + vma_changed = true; + } vma->vm_start = start; - start_changed = true; } if (end != vma->vm_end) { + if (vma->vm_end > end) { + if (!insert || (insert->vm_start != end)) { + vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); + VM_WARN_ON(insert && + insert->vm_end < vma->vm_end); + } + } else { + vma_changed = true; + } vma->vm_end = end; - end_changed = true; } + + if (vma_changed) + vma_mas_store(vma, &mas); + vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; + vma_mas_store(next, &mas); } if (file) { @@ -855,42 +796,19 @@ again: flush_dcache_mmap_unlock(mapping); } - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - */ - if (remove_next != 3) - __vma_unlink(mm, next, next); - else - /* - * vma is not before next if they've been - * swapped. - * - * pre-swap() next->vm_start was reduced so - * tell validate_mm_rb to ignore pre-swap() - * "next" (which is stored in post-swap() - * "vma"). - */ - __vma_unlink(mm, next, vma); - if (file) - __remove_shared_vm_struct(next, file, mapping); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); + if (remove_next == 2) + __remove_shared_vm_struct(next_next, file, mapping); } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, insert); - } else { - if (start_changed) - vma_gap_update(vma); - if (end_changed) { - if (!next) - mm->highest_vm_end = vm_end_gap(vma); - else if (!adjust_next) - vma_gap_update(next); - } + mas_reset(&mas); + vma_mas_store(insert, &mas); + mm->map_count++; } if (anon_vma) { @@ -909,6 +827,7 @@ again: } if (remove_next) { +again: if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); @@ -917,66 +836,24 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); + if (remove_next != 2) + BUG_ON(vma->vm_end < next->vm_end); vm_area_free(next); + /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove another next too. It would clutter - * up the code too much to do both in one go. + * we must remove next_next too. */ - if (remove_next != 3) { - /* - * If "next" was removed and vma->vm_end was - * expanded (up) over it, in turn - * "next->vm_prev->vm_end" changed and the - * "vma->vm_next" gap must be updated. - */ - next = vma->vm_next; - } else { - /* - * For the scope of the comment "next" and - * "vma" considered pre-swap(): if "vma" was - * removed, next->vm_start was expanded (down) - * over it and the "next" gap must be updated. - * Because of the swap() the post-swap() "vma" - * actually points to pre-swap() "next" - * (post-swap() "next" as opposed is now a - * dangling pointer). - */ - next = vma; - } if (remove_next == 2) { remove_next = 1; - end = next->vm_end; + next = next_next; goto again; } - else if (next) - vma_gap_update(next); - else { - /* - * If remove_next == 2 we obviously can't - * reach this path. - * - * If remove_next == 3 we can't reach this - * path because pre-swap() next is always not - * NULL. pre-swap() "next" is not being - * removed and its next->vm_end is not altered - * (and furthermore "end" already matches - * next->vm_end in remove_next == 3). - * - * We reach this only in the remove_next == 1 - * case if the "next" vma that was removed was - * the highest vma of the mm. However in such - * case next->vm_end == "end" and the extended - * "vma" has vma->vm_end == next->vm_end so - * mm->highest_vm_end doesn't need any update - * in remove_next == 1 case. - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); - } } if (insert && file) uprobe_mmap(insert); + mas_destroy(&mas); validate_mm(mm); return 0; @@ -1128,8 +1005,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - int err; + struct vm_area_struct *mid, *next, *res; + int err = -1; + bool merge_prev = false; + bool merge_next = false; /* * We later require that vma->vm_flags == vm_flags, @@ -1138,76 +1017,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = vma_next(mm, prev); - area = next; - if (area && area->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; + next = find_vma(mm, prev ? prev->vm_end : 0); + mid = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* - * Can it merge with the predecessor? - */ + /* Can we merge the predecessor? */ if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { - /* - * OK, it can. Can we now merge in the successor as well? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, - pgoff+pglen, - vm_userfaultfd_ctx, anon_name) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { - /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); - } else /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); - if (err) - return NULL; - khugepaged_enter_vma(prev, vm_flags); - return prev; + merge_prev = true; } - - /* - * Can this new request be merged in front of next? - */ + /* Can we merge the successor? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { + merge_next = true; + } + /* Can we merge both the predecessor and the successor? */ + if (merge_prev && merge_next && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma, NULL)) { /* cases 1, 6 */ + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); + res = prev; + } else if (merge_prev) { /* cases 2, 5, 7 */ + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); + res = prev; + } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); - else { /* cases 3, 8 */ - err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); - /* - * In case 3 area is already equal to next and - * this is a noop, but in case 8 "area" has - * been removed and next was expanded over it. - */ - area = next; - } - if (err) - return NULL; - khugepaged_enter_vma(area, vm_flags); - return area; + addr, prev->vm_pgoff, NULL, next); + else /* cases 3, 8 */ + err = __vma_adjust(mid, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + res = next; } - return NULL; + /* + * Cannot merge with predecessor or successor or error in __vma_adjust? + */ + if (err) + return NULL; + khugepaged_enter_vma(res, vm_flags); + return res; } /* @@ -1275,18 +1139,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; /* Try next first. */ - if (vma->vm_next) { - anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + next = mas_walk(&mas); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } + prev = mas_prev(&mas, 0); + VM_BUG_ON_VMA(prev != vma, vma); + prev = mas_prev(&mas, 0); /* Try prev next. */ - if (vma->vm_prev) - anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find @@ -1375,6 +1245,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; + validate_mm(mm); *populate = 0; if (!len) @@ -1678,388 +1549,63 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev, *merge; - int error; - struct rb_node **rb_link, *rb_parent; - unsigned long charged = 0; - - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. - */ - nr_pages = count_vma_pages_range(mm, addr, addr + len); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } - - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) - return -ENOMEM; - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - - /* - * Can we just expand an old mapping? - */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. - */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_flags = vm_flags; - vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; - - if (file) { - if (vm_flags & VM_SHARED) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } - - vma->vm_file = get_file(file); - error = call_mmap(file, vma); - if (error) - goto unmap_and_free_vma; - - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - * Bug: If addr is changed, prev, rb_link, rb_parent should - * be updated for vma_link() - */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; - - /* If vm_flags changed after call_mmap(), we should try merge vma again - * as we may succeed this time. - */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (merge) { - /* ->mmap() can change vma->vm_file and fput the original file. So - * fput the vma->vm_file here or we would add an extra fput for file - * and cause general protection fault ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. */ - vm_flags = vma->vm_flags; - goto unmap_writable; - } - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; - } else { - vma_set_anonymous(vma); - } - - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - vma_link(mm, vma, prev, rb_link, rb_parent); - - /* - * vma_merge() calls khugepaged_enter_vma() either, the below - * call covers the non-merge case. - */ - khugepaged_enter_vma(vma, vma->vm_flags); - - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (file && vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - file = vma->vm_file; -out: - perf_event_mmap(vma); - - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - else - mm->locked_vm += (len >> PAGE_SHIFT); - } - - if (file) - uprobe_mmap(vma); - - /* - * New (or expanded) vma always get soft dirty status. - * Otherwise user-space soft-dirty page tracker won't - * be able to distinguish situation when vma area unmapped, - * then new mapped in-place (which must be aimed as - * a completely new data area). - */ - vma->vm_flags |= VM_SOFTDIRTY; - - vma_set_page_prot(vma); - - return addr; - -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - return error; -} - +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range (low_limit - + * hight_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - /* - * We implement the search by looking for an rbtree node that - * immediately follows a suitable gap. That is, - * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; - * - gap_end = vma->vm_start >= info->low_limit + length; - * - gap_end - gap_start >= length - */ + unsigned long length, gap; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long length, low_limit, high_limit, gap_start, gap_end; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; - /* Adjust search limits by the desired length */ - if (info->high_limit < length) + if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, + length)) return -ENOMEM; - high_limit = info->high_limit - length; - if (info->low_limit > high_limit) - return -ENOMEM; - low_limit = info->low_limit + length; - - /* Check if rbtree root looks promising */ - if (RB_EMPTY_ROOT(&mm->mm_rb)) - goto check_highest; - vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); - if (vma->rb_subtree_gap < length) - goto check_highest; - - while (true) { - /* Visit left subtree if it looks promising */ - gap_end = vm_start_gap(vma); - if (gap_end >= low_limit && vma->vm_rb.rb_left) { - struct vm_area_struct *left = - rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb); - if (left->rb_subtree_gap >= length) { - vma = left; - continue; - } - } - - gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; -check_current: - /* Check if current node has a suitable gap */ - if (gap_start > high_limit) - return -ENOMEM; - if (gap_end >= low_limit && - gap_end > gap_start && gap_end - gap_start >= length) - goto found; - - /* Visit right subtree if it looks promising */ - if (vma->vm_rb.rb_right) { - struct vm_area_struct *right = - rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb); - if (right->rb_subtree_gap >= length) { - vma = right; - continue; - } - } - - /* Go back up the rbtree to find next candidate node */ - while (true) { - struct rb_node *prev = &vma->vm_rb; - if (!rb_parent(prev)) - goto check_highest; - vma = rb_entry(rb_parent(prev), - struct vm_area_struct, vm_rb); - if (prev == vma->vm_rb.rb_left) { - gap_start = vm_end_gap(vma->vm_prev); - gap_end = vm_start_gap(vma); - goto check_current; - } - } - } - -check_highest: - /* Check highest gap, which does not precede any rbtree node */ - gap_start = mm->highest_vm_end; - gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ - if (gap_start > high_limit) - return -ENOMEM; - -found: - /* We found a suitable gap. Clip it with the original low_limit. */ - if (gap_start < info->low_limit) - gap_start = info->low_limit; - - /* Adjust gap address to the desired alignment */ - gap_start += (info->align_offset - gap_start) & info->align_mask; - - VM_BUG_ON(gap_start + info->length > info->high_limit); - VM_BUG_ON(gap_start + info->length > gap_end); - return gap_start; + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + return gap; } +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with * the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range (low_limit - + * hight_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long length, gap; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; - /* - * Adjust search limits by the desired length. - * See implementation comment at top of unmapped_area(). - */ - gap_end = info->high_limit; - if (gap_end < length) - return -ENOMEM; - high_limit = gap_end - length; - - if (info->low_limit > high_limit) - return -ENOMEM; - low_limit = info->low_limit + length; - - /* Check highest gap, which does not precede any rbtree node */ - gap_start = mm->highest_vm_end; - if (gap_start <= high_limit) - goto found_highest; - - /* Check if rbtree root looks promising */ - if (RB_EMPTY_ROOT(&mm->mm_rb)) - return -ENOMEM; - vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); - if (vma->rb_subtree_gap < length) + if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length)) return -ENOMEM; - while (true) { - /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; - if (gap_start <= high_limit && vma->vm_rb.rb_right) { - struct vm_area_struct *right = - rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb); - if (right->rb_subtree_gap >= length) { - vma = right; - continue; - } - } - -check_current: - /* Check if current node has a suitable gap */ - gap_end = vm_start_gap(vma); - if (gap_end < low_limit) - return -ENOMEM; - if (gap_start <= high_limit && - gap_end > gap_start && gap_end - gap_start >= length) - goto found; - - /* Visit left subtree if it looks promising */ - if (vma->vm_rb.rb_left) { - struct vm_area_struct *left = - rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb); - if (left->rb_subtree_gap >= length) { - vma = left; - continue; - } - } - - /* Go back up the rbtree to find next candidate node */ - while (true) { - struct rb_node *prev = &vma->vm_rb; - if (!rb_parent(prev)) - return -ENOMEM; - vma = rb_entry(rb_parent(prev), - struct vm_area_struct, vm_rb); - if (prev == vma->vm_rb.rb_right) { - gap_start = vma->vm_prev ? - vm_end_gap(vma->vm_prev) : 0; - goto check_current; - } - } - } - -found: - /* We found a suitable gap. Clip it with the original high_limit. */ - if (gap_end > info->high_limit) - gap_end = info->high_limit; - -found_highest: - /* Compute highest gap address at the desired alignment */ - gap_end -= info->length; - gap_end -= (gap_end - info->align_offset) & info->align_mask; - - VM_BUG_ON(gap_end < info->low_limit); - VM_BUG_ON(gap_end < gap_start); - return gap_end; + gap = mas.last + 1 - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + return gap; } /* @@ -2249,58 +1795,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct rb_node *rb_node; - struct vm_area_struct *vma; + unsigned long index = start_addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *tmp; - - tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); - if (tmp->vm_end > addr) { - vma = tmp; - if (tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } +/** + * find_vma() - Find the VMA for a given address, or the next VMA. + * @mm: The mm_struct to check + * @addr: The address + * + * Returns: The VMA associated with addr, or the next VMA. + * May return %NULL in the case of no VMA at addr or above. + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + unsigned long index = addr; - if (vma) - vmacache_update(addr, vma); - return vma; + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, ULONG_MAX); } - EXPORT_SYMBOL(find_vma); -/* - * Same as find_vma, but also return a pointer to the previous VMA in *pprev. +/** + * find_vma_prev() - Find the VMA for a given address, or the next vma and + * set %pprev to the previous VMA, if any. + * @mm: The mm_struct to check + * @addr: The address + * @pprev: The pointer to set to the previous VMA + * + * Note that RCU lock is missing here since the external mmap_lock() is used + * instead. + * + * Returns: The VMA associated with @addr, or the next vma. + * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); - vma = find_vma(mm, addr); - if (vma) { - *pprev = vma->vm_prev; - } else { - struct rb_node *rb_node = rb_last(&mm->mm_rb); - - *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; - } + vma = mas_walk(&mas); + *pprev = mas_prev(&mas, 0); + if (!vma) + vma = mas_next(&mas, ULONG_MAX); return vma; } @@ -2354,6 +1909,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2371,16 +1927,21 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; - next = vma->vm_next; - if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2401,15 +1962,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2417,11 +1976,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2430,7 +1987,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); + mas_destroy(&mas); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2438,10 +1995,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, - unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; @@ -2450,7 +2007,7 @@ int expand_downwards(struct vm_area_struct *vma, return -EPERM; /* Enforce stack_guard_gap */ - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev)) { @@ -2458,9 +2015,14 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2481,15 +2043,13 @@ int expand_downwards(struct vm_area_struct *vma, error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2498,8 +2058,9 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - vma_gap_update(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2508,7 +2069,7 @@ int expand_downwards(struct vm_area_struct *vma, } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); + mas_destroy(&mas); return error; } @@ -2581,25 +2142,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL_GPL(find_extend_vma); /* - * Ok - we have the memory areas we should free on the vma list, - * so release them, and do the vma updates. + * Ok - we have the memory areas we should free on a maple tree so release them, + * and do the vma updates. * * Called with the mm semaphore held. */ -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) { unsigned long nr_accounted = 0; + struct vm_area_struct *vma; /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); - do { + mas_for_each(mas, vma, ULONG_MAX) { long nrpages = vma_pages(vma); if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, -nrpages); - vma = remove_vma(vma); - } while (vma); + remove_vma(vma); + } vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -2609,67 +2171,23 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) * * Called with the mm semaphore held. */ -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, + struct vm_area_struct *next, unsigned long start, unsigned long end) { - struct vm_area_struct *next = vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end); - free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mt, vma, start, end); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); } /* - * Create a list of vma's touched by the unmap, removing them from the mm's - * vma list as we go.. - */ -static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - vma->vm_prev = NULL; - do { - vma_rb_erase(vma, &mm->mm_rb); - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(vma); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - if (vma) { - vma->vm_prev = prev; - vma_gap_update(vma); - } else - mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; - tail_vma->vm_next = NULL; - - /* Kill the cache */ - vmacache_invalidate(mm); - - /* - * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or - * VM_GROWSUP VMA. Such VMAs can change their size under - * down_read(mmap_lock) and collide with the VMA we are about to unmap. - */ - if (vma && (vma->vm_flags & VM_GROWSDOWN)) - return false; - if (prev && (prev->vm_flags & VM_GROWSUP)) - return false; - return true; -} - -/* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. */ @@ -2678,6 +2196,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct *new; int err; + validate_mm_mt(mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2720,6 +2239,9 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!err) return 0; + /* Avoid vm accounting in close() operation */ + new->vm_start = new->vm_end; + new->vm_pgoff = 0; /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); @@ -2730,6 +2252,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); + validate_mm_mt(mm); return err; } @@ -2746,38 +2269,48 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -/* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. - * Jeremy Fitzhardinge <jeremy@goop.org> - */ -int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) { - unsigned long end; - struct vm_area_struct *vma, *prev, *last; - - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; - len = PAGE_ALIGN(len); - end = start + len; - if (len == 0) - return -EINVAL; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); - /* - * arch_unmap() might do unmaps itself. It must be called - * and finish any rbtree manipulation before this code - * runs and also starts to manipulate the rbtree. - */ - arch_unmap(mm, start, end); + return 0; +} - /* Find the first overlapping VMA where start < vma->vm_end */ - vma = find_vma_intersection(mm, start, end); - if (!vma) - return 0; - prev = vma->vm_prev; +/* + * do_mas_align_munmap() - munmap the aligned region from @start to @end. + * @mas: The maple_state, ideally set up to alter the correct tree location. + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @downgrade: Set to true to attempt a write downgrade of the mmap_sem + * + * If @downgrade is true, check return code for potential release of the lock. + */ +static int +do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool downgrade) +{ + struct vm_area_struct *prev, *next = NULL; + struct maple_tree mt_detach; + int count = 0; + int error = -ENOMEM; + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt_detach, &mm->mmap_lock); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2785,8 +2318,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * unmapped vm_area_struct will remain in use: so lower split_vma * places tmp vma above, and higher split_vma places tmp vma below. */ + + /* Does it split the first one? */ if (start > vma->vm_start) { - int error; /* * Make sure that map_count on return from munmap() will @@ -2794,22 +2328,61 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * its limit temporarily, to help free resources as expected. */ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) - return -ENOMEM; + goto map_count_exceeded; + /* + * mas_pause() is not needed since mas->index needs to be set + * differently than vma->vm_end anyways. + */ error = __split_vma(mm, vma, start, 0); if (error) - return error; - prev = vma; + goto start_split_failed; + + mas_set(mas, start); + vma = mas_walk(mas); } - /* Does it split the last one? */ - last = find_vma(mm, end); - if (last && end > last->vm_start) { - int error = __split_vma(mm, last, end, 1); + prev = mas_prev(mas, 0); + if (unlikely((!prev))) + mas_set(mas, start); + + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + mas_for_each(mas, next, end - 1) { + /* Does it split the end? */ + if (next->vm_end > end) { + struct vm_area_struct *split; + + error = __split_vma(mm, next, end, 1); + if (error) + goto end_split_failed; + + mas_set(mas, end); + split = mas_prev(mas, 0); + error = munmap_sidetree(split, &mas_detach); + if (error) + goto munmap_sidetree_failed; + + count++; + if (vma == next) + vma = split; + break; + } + error = munmap_sidetree(next, &mas_detach); if (error) - return error; + goto munmap_sidetree_failed; + + count++; +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < start); + BUG_ON(next->vm_start > end); +#endif } - vma = vma_next(mm, prev); + + if (!next) + next = mas_next(mas, ULONG_MAX); if (unlikely(uf)) { /* @@ -2821,30 +2394,372 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ - int error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(mm, start, end, uf); + if (error) - return error; + goto userfaultfd_error; + } + + /* Point of no return */ + mas_set_range(mas, start, end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, &mt_detach, start, end - 1); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + rcu_read_lock(); + vma_test = mas_find(&test, end - 1); + mas_for_each(mas, vma_mas, end - 1) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, end - 1); + } + rcu_read_unlock(); + BUG_ON(count != test_count); + mas_set_range(mas, start, end - 1); + } +#endif + mas_store_prealloc(mas, NULL); + mm->map_count -= count; + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (downgrade) { + if (next && (next->vm_flags & VM_GROWSDOWN)) + downgrade = false; + else if (prev && (prev->vm_flags & VM_GROWSUP)) + downgrade = false; + else + mmap_write_downgrade(mm); } - /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) - downgrade = false; + unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* Statistics and freeing VMAs */ + mas_set(&mas_detach, start); + remove_mt(mm, &mas_detach); + __mt_destroy(&mt_detach); - if (downgrade) - mmap_write_downgrade(mm); - unmap_region(mm, vma, prev, start, end); + validate_mm(mm); + return downgrade ? 1 : 0; - /* Fix up all other VM information */ - remove_vma_list(mm, vma); +userfaultfd_error: +munmap_sidetree_failed: +end_split_failed: + __mt_destroy(&mt_detach); +start_split_failed: +map_count_exceeded: + mas_destroy(mas); + return error; +} - return downgrade ? 1 : 0; +/* + * do_mas_munmap() - munmap a given range. + * @mas: The maple state + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @downgrade: set to true if the user wants to attempt to write_downgrade the + * mmap_sem + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned and any arch_unmap work will be preformed. + * + * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. + */ +int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; + + /* arch_unmap() might do unmaps itself. */ + arch_unmap(mm, start, end); + + /* Find the first overlapping VMA */ + vma = mas_find(mas, end - 1); + if (!vma) + return 0; + + return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); } +/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length to be munmapped. + * @uf: The userfaultfd list_head + */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - return __do_munmap(mm, start, len, uf, false); + MA_STATE(mas, &mm->mm_mt, start, start); + + return do_mas_munmap(&mas, mm, start, len, uf, false); +} + +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; + unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); + + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. + */ + nr_pages = count_vma_pages_range(mm, addr, end); + + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } + + /* Unmap any existing mapping in the area */ + if (do_mas_munmap(&mas, mm, addr, len, uf, false)) + return -ENOMEM; + + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(file, vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + + mas.index = addr; + mas.last = end - 1; +cannot_expand: + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = vm_area_alloc(mm); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma->vm_start = addr; + vma->vm_end = end; + vma->vm_flags = vm_flags; + vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->vm_pgoff = pgoff; + + if (file) { + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto free_vma; + } + + vma->vm_file = get_file(file); + error = call_mmap(file, vma); + if (error) + goto unmap_and_free_vma; + + /* + * Expansion is handled above, merging is handled below. + * Drivers should not alter the address of the VMA. + */ + if (WARN_ON((addr != vma->vm_start))) { + error = -EINVAL; + goto close_and_free_vma; + } + mas_reset(&mas); + + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. */ + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + + vm_flags = vma->vm_flags; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + vma_set_anonymous(vma); + } + + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + error = -ENOMEM; + if (file) + goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + + /* + * vma_merge() calls khugepaged_enter_vma() either, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + + /* Once vma denies write, undo our temporary denial count */ +unmap_writable: + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + file = vma->vm_file; +expanded: + perf_event_mmap(vma); + + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + + vma_set_page_prot(vma); + + validate_mm(mm); + return addr; + +close_and_free_vma: + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); +unmap_and_free_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); + if (file && (vm_flags & VM_SHARED)) + mapping_unmap_writable(file->f_mapping); +free_vma: + vm_area_free(vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + validate_mm(mm); + return error; } static int __vm_munmap(unsigned long start, size_t len, bool downgrade) @@ -2852,11 +2767,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, start, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __do_munmap(mm, start, len, &uf, downgrade); + ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2922,11 +2838,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; if (start + size > vma->vm_end) { - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, vma->vm_end); + struct vm_area_struct *next, *prev = vma; - for (next = vma->vm_next; next; next = next->vm_next) { + for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ - if (next->vm_start != next->vm_prev->vm_end) + if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) @@ -2937,6 +2854,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (start + size <= next->vm_end) break; + + prev = next; } if (!next) @@ -2966,37 +2885,53 @@ out: } /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. + * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. */ -static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - struct rb_node **rb_link, *rb_parent; - pgoff_t pgoff = addr >> PAGE_SHIFT; - int error; - unsigned long mapped_addr; - - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) - return mapped_addr; + struct mm_struct *mm = vma->vm_mm; + int ret; - error = mlock_future_check(mm, mm->def_flags, len); - if (error) - return error; + arch_unmap(mm, newbrk, oldbrk); + ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); + validate_mm_mt(mm); + return ret; +} - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) - return -ENOMEM; +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; - /* Check against address space limits *after* clearing old maps... */ + validate_mm_mt(mm); + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -3006,28 +2941,50 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* Can we just expand an old private anonymous mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - /* - * create a vma struct for an anonymous mapping + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. */ - vma = vm_area_alloc(mm); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; + if (vma && vma->vm_end == addr && !vma_policy(vma) && + can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto unacct_fail; + + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + mas_store_prealloc(mas, vma); + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma(vma, flags); + goto out; } + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto unacct_fail; + vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; + vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -3035,16 +2992,25 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; + validate_mm(mm); return 0; + +mas_store_fail: + vm_area_free(vma); +unacct_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); len = PAGE_ALIGN(request); if (len < request) @@ -3055,13 +3021,31 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + + ret = check_brk_limits(addr, len); + if (ret) + goto limits_failed; + + ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + if (ret) + goto munmap_failed; + + vma = mas_prev(&mas, 0); + ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; + +munmap_failed: +limits_failed: + mmap_write_unlock(mm); + return ret; } EXPORT_SYMBOL(vm_brk_flags); @@ -3077,34 +3061,19 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Manually reap the mm to free as much memory as possible. - * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_lock for - * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_lock is - * dropped. - * - * Nothing can be holding mm->mmap_lock here and the above call - * to mmu_notifier_release(mm) ensures mmu notifier callbacks in - * __oom_reap_task_mm() will not block. - */ - (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); - } - - mmap_write_lock(mm); + mmap_read_lock(mm); arch_exit_mmap(mm); - vma = mm->mmap; + vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ - mmap_write_unlock(mm); + mmap_read_unlock(mm); return; } @@ -3112,19 +3081,37 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ - /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + mmap_read_unlock(mm); + + /* + * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper + * because the memory has been already freed. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + mmap_write_lock(mm); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* Walk the list again, actually closing and freeing it. */ - while (vma) { + /* + * Walk the list again, actually closing and freeing it, with preemption + * enabled, without holding any MM locks besides the unreachable + * mmap_write_lock. + */ + do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - vma = remove_vma(vma); + remove_vma(vma); + count++; cond_resched(); - } - mm->mmap = NULL; + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + + BUG_ON(count != mm->map_count); + + trace_exit_mmap(mm); + __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3135,14 +3122,14 @@ void exit_mmap(struct mm_struct *mm) */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; + unsigned long charged = vma_pages(vma); - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; + if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, vma_pages(vma))) + security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* @@ -3162,7 +3149,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma)) { + vm_unacct_memory(charged); + return -ENOMEM; + } + return 0; } @@ -3178,9 +3169,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; + validate_mm_mt(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3190,8 +3181,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + new_vma = find_vma_prev(mm, addr, &prev); + if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3232,16 +3225,27 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); + if (vma_link(mm, new_vma)) + goto out_vma_link; *need_rmap_locks = false; } + validate_mm_mt(mm); return new_vma; +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: + validate_mm_mt(mm); return NULL; } @@ -3378,6 +3382,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; + validate_mm_mt(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3400,10 +3405,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); + validate_mm_mt(mm); return vma; out: vm_area_free(vma); + validate_mm_mt(mm); return ERR_PTR(ret); } @@ -3528,12 +3535,13 @@ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3541,7 +3549,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3549,7 +3558,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3608,11 +3618,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); @@ -3733,13 +3744,9 @@ static int reserve_mem_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static struct notifier_block reserve_mem_nb = { - .notifier_call = reserve_mem_notifier, -}; - static int __meminit init_reserve_notifier(void) { - if (register_hotmemory_notifier(&reserve_mem_nb)) + if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index a71924bd38c0..2b93cf6ac9ae 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -8,6 +8,7 @@ #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> +#include <linux/rmap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -18,6 +19,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; + /* Limit batching if we have delayed rmaps pending */ + if (tlb->delayed_rmap && tlb->active != &tlb->local) + return false; + batch = tlb->active; if (batch->next) { tlb->active = batch->next; @@ -42,12 +47,46 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } +#ifdef CONFIG_SMP +static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) +{ + for (int i = 0; i < batch->nr; i++) { + struct encoded_page *enc = batch->encoded_pages[i]; + + if (encoded_page_flags(enc)) { + struct page *page = encoded_page_ptr(enc); + page_remove_rmap(page, vma, false); + } + } +} + +/** + * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB + * @tlb: the current mmu_gather + * + * Note that because of how tlb_next_batch() above works, we will + * never start multiple new batches with pending delayed rmaps, so + * we only need to walk through the current active batch and the + * original local one. + */ +void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + if (!tlb->delayed_rmap) + return; + + tlb_flush_rmap_batch(&tlb->local, vma); + if (tlb->active != &tlb->local) + tlb_flush_rmap_batch(tlb->active, vma); + tlb->delayed_rmap = 0; +} +#endif + static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct page **pages = batch->pages; + struct encoded_page **pages = batch->encoded_pages; do { /* @@ -76,7 +115,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) { struct mmu_gather_batch *batch; @@ -91,13 +130,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. */ - batch->pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); return false; } @@ -152,7 +191,7 @@ static void tlb_remove_table_smp_sync(void *arg) /* Simply deliver the interrupt */ } -static void tlb_remove_table_sync_one(void) +void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be @@ -176,8 +215,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch) #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ -static void tlb_remove_table_sync_one(void) { } - static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); @@ -276,6 +313,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, tlb->active = &tlb->local; tlb->batch_count = 0; #endif + tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE diff --git a/mm/mmzone.c b/mm/mmzone.c index 0ae7571e35ab..68e1511be12d 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec) * Poison its list head, so that any operations on it would crash. */ list_del(&lruvec->lists[LRU_UNEVICTABLE]); + + lru_gen_init_lruvec(lruvec); } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) diff --git a/mm/mprotect.c b/mm/mprotect.c index bc6bddd156ca..908df12caa26 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -31,6 +31,7 @@ #include <linux/pgtable.h> #include <linux/sched/sysctl.h> #include <linux/userfaultfd_k.h> +#include <linux/memory-tiers.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -38,14 +39,16 @@ #include "internal.h" -static inline bool can_change_pte_writable(struct vm_area_struct *vma, - unsigned long addr, pte_t pte) +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { struct page *page; - VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; - if (pte_protnone(pte) || !pte_dirty(pte)) + /* Don't touch entries that are not even readable. */ + if (pte_protnone(pte)) return false; /* Do we need write faults for softdirty tracking? */ @@ -58,17 +61,23 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) { /* - * We can only special-case on exclusive anonymous pages, - * because we know that our write-fault handler similarly would - * map them writable without any additional checks while holding - * the PT lock. + * Writable MAP_PRIVATE mapping: We can only special-case on + * exclusive anonymous pages, because we know that our + * write-fault handler similarly would map them writable without + * any additional checks while holding the PT lock. */ page = vm_normal_page(vma, addr, pte); - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) - return false; + return page && PageAnon(page) && PageAnonExclusive(page); } - return true; + /* + * Writable MAP_SHARED mapping: "clean" might indicate that the FS still + * needs a real write-fault for writenotify + * (see vma_wants_writenotify()). If "dirty", the assumption is that the + * FS was already notified and we can simply mark the PTE writable + * just like the write-fault handler would do. + */ + return pte_dirty(pte); } static unsigned long change_pte_range(struct mmu_gather *tlb, @@ -112,7 +121,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -121,6 +129,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, if (prot_numa) { struct page *page; int nid; + bool toptier; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) @@ -150,20 +159,23 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, nid = page_to_nid(page); if (target_node == nid) continue; + toptier = node_is_toptier(nid); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && - node_is_toptier(nid)) + toptier) continue; + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, + jiffies_to_msecs(jiffies)); } oldpte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (preserve_write) - ptent = pte_mk_savedwrite(ptent); if (uffd_wp) { ptent = pte_wrprotect(ptent); @@ -285,7 +297,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, */ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -543,8 +555,8 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; + unsigned int mm_cp_flags = 0; unsigned long charged = 0; - bool try_change_writable; pgoff_t pgoff; int error; @@ -622,20 +634,11 @@ success: * held in write mode. */ vma->vm_flags = newflags; - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); - else - try_change_writable = !!(vma->vm_flags & VM_WRITE); + if (vma_wants_manual_pte_write_upgrade(vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, - try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); + change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -669,6 +672,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); start = untagged_addr(start); @@ -700,7 +704,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); error = -ENOMEM; if (!vma) goto out; @@ -726,7 +731,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (start > vma->vm_start) prev = vma; else - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); for (nstart = start ; ; ) { @@ -745,8 +750,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, * If a permission is not passed to mprotect(), it must be * cleared from the VMA. */ - mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | - VM_FLAGS_CLEAR; + mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR; new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); @@ -789,7 +793,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(current->mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; diff --git a/mm/mremap.c b/mm/mremap.c index b522cd0259a0..fe587c5d6591 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,6 +9,7 @@ */ #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <linux/ksm.h> @@ -23,6 +24,7 @@ #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> +#include <linux/mempolicy.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -716,7 +718,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (excess) { vma->vm_flags |= VM_ACCOUNT; if (split) - vma->vm_next->vm_flags |= VM_ACCOUNT; + find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; @@ -866,9 +868,10 @@ out: static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ return 0; - if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) @@ -975,20 +978,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * __do_munmap does all the needed commit accounting, and + * do_mas_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; + MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); - retval = __do_munmap(mm, addr+new_len, old_len - new_len, - &uf_unmap, true); - if (retval < 0 && old_len != new_len) { - ret = retval; - goto out; + retval = do_mas_munmap(&mas, mm, addr + new_len, + old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ - } else if (retval == 1) + if (retval == 1) { downgraded = true; + } else if (retval < 0 && old_len != new_len) { + ret = retval; + goto out; + } + ret = addr; goto out; } @@ -1008,6 +1014,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { long pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long extension_start = addr + old_len; + unsigned long extension_end = addr + new_len; + pgoff_t extension_pgoff = vma->vm_pgoff + + ((extension_start - vma->vm_start) >> PAGE_SHIFT); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1016,8 +1026,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } } - if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + /* + * Function vma_merge() is called on the extension we are adding to + * the already existing vma, vma_merge() will merge this extension with + * the already existing vma (expand operation itself) and possibly also + * with the next vma if it becomes adjacent to the expanded vma and + * otherwise compatible. + */ + vma = vma_merge(mm, vma, extension_start, extension_end, + vma->vm_flags, vma->anon_vma, vma->vm_file, + extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; goto out; diff --git a/mm/msync.c b/mm/msync.c index 137d1c104f3e..ac4c9bfea2e7 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) error = 0; goto out_unlock; } - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); } } out_unlock: diff --git a/mm/nommu.c b/mm/nommu.c index e819cbc21b39..214c70e1d059 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -19,7 +19,6 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/sched/mm.h> -#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -545,26 +544,27 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) { - struct vm_area_struct *pvma, *prev; - struct address_space *mapping; - struct rb_node **p, *parent, *rb_prev; + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} - BUG_ON(!vma->vm_region); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) +{ mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); @@ -572,67 +572,51 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} - /* add the VMA to the tree */ - parent = rb_prev = NULL; - p = &mm->mm_rb.rb_node; - while (*p) { - parent = *p; - pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - - /* sort by: start addr, end addr, VMA struct addr in that order - * (the latter is necessary as we may get identical VMAs) */ - if (vma->vm_start < pvma->vm_start) - p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma->vm_end < pvma->vm_end) - p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) { - rb_prev = parent; - p = &(*p)->rb_right; - } else - BUG(); - } - - rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); +/* + * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). + * @mas: The maple state with preallocations. + * @mm: The mm_struct + * @vma: The vma to add + * + */ +static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, + struct vm_area_struct *vma) +{ + BUG_ON(!vma->vm_region); - /* add VMA to the VMA list also */ - prev = NULL; - if (rb_prev) - prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + setup_vma_to_mm(vma, mm); - __vma_link_list(mm, vma, prev); + /* add the VMA to the tree */ + vma_mas_store(vma, mas); } /* - * delete a VMA from its owning mm_struct and address space + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_lock held writelocked */ -static void delete_vma_from_mm(struct vm_area_struct *vma) -{ - int i; - struct address_space *mapping; - struct mm_struct *mm = vma->vm_mm; - struct task_struct *curr = current; - - mm->map_count--; - for (i = 0; i < VMACACHE_SIZE; i++) { - /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache.vmas[i] == vma) { - vmacache_invalidate(mm); - break; - } +static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; } + mas_add_vma_to_mm(&mas, mm, vma); + return 0; +} +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ + vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { + struct address_space *mapping; mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); @@ -641,11 +625,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} +/* + * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - /* remove from the MM's tree and list */ - rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); - __vma_unlink_list(mm, vma); + /* remove from the MM's tree and list */ + vma_mas_remove(vma, &mas); + return 0; } /* @@ -661,31 +658,26 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) vm_area_free(vma); } +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + /* * look up the first VMA in which addr resides, NULL if none * - should be called with mm->mmap_lock at least held readlocked */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); - return vma; - } - } - - return NULL; + return mas_walk(&mas); } EXPORT_SYMBOL(find_vma); @@ -717,26 +709,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; + MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find_exact(mm, addr, end); - if (vma) - return vma; - - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start < addr) - continue; - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end == end) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; - return NULL; + return vma; } /* @@ -1069,6 +1052,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); *populate = 0; @@ -1087,6 +1071,7 @@ unsigned long do_mmap(struct file *file, * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); + /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); if (!region) @@ -1096,6 +1081,9 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + goto error_maple_preallocate; + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1236,7 +1224,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - add_vma_to_mm(current->mm, vma); + mas_add_vma_to_mm(&mas, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1262,6 +1250,7 @@ error: sharing_violation: up_write(&nommu_region_sem); + mas_destroy(&mas); pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; @@ -1278,6 +1267,14 @@ error_getting_region: len, current->pid); show_free_areas(0, NULL); return -ENOMEM; + +error_maple_preallocate: + kmem_cache_free(vm_region_jar, region); + vm_area_free(vma); + pr_warn("Allocation of vma tree for process %d failed\n", current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1343,6 +1340,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1357,9 +1355,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return -ENOMEM; new = vm_area_dup(vma); - if (!new) { - kmem_cache_free(vm_region_jar, region); - return -ENOMEM; + if (!new) + goto err_vma_dup; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_mas_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1378,7 +1380,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - delete_vma_from_mm(vma); down_write(&nommu_region_sem); delete_nommu_region(vma->vm_region); if (new_below) { @@ -1391,9 +1392,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); up_write(&nommu_region_sem); - add_vma_to_mm(mm, vma); - add_vma_to_mm(mm, new); + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); + mas_store(&mas, vma); + vma_mas_store(new, &mas); return 0; + +err_mas_preallocate: + vm_area_free(new); +err_vma_dup: + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; } /* @@ -1408,12 +1419,14 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + return -ENOMEM; if (from > vma->vm_start) vma->vm_end = from; else vma->vm_start = to; - add_vma_to_mm(mm, vma); + if (add_vma_to_mm(mm, vma)) + return -ENOMEM; /* cut the backing region down to size */ region = vma->vm_region; @@ -1441,9 +1454,10 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *vma; unsigned long end; - int ret; + int ret = 0; len = PAGE_ALIGN(len); if (len == 0) @@ -1452,7 +1466,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = find_vma(mm, start); + vma = mas_find(&mas, end - 1); if (!vma) { static int limit; if (limit < 5) { @@ -1471,7 +1485,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = vma->vm_next; + vma = mas_next(&mas, end - 1); } while (vma); return -EINVAL; } else { @@ -1493,9 +1507,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list } erase_whole_vma: - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; delete_vma(mm, vma); - return 0; + return ret; } int vm_munmap(unsigned long addr, size_t len) @@ -1520,6 +1535,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) */ void exit_mmap(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; if (!mm) @@ -1527,12 +1543,18 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; - while ((vma = mm->mmap)) { - mm->mmap = vma->vm_next; - delete_vma_from_mm(vma); + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); delete_vma(mm, vma); cond_resched(); } + __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); } int vm_brk(unsigned long addr, unsigned long len) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3c6cf9e3cd66..1276e49b31b0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -461,7 +461,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (is_memcg_oom(oc)) mem_cgroup_print_oom_meminfo(oc->memcg); else { - show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); + __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } @@ -509,10 +509,11 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -bool __oom_reap_task_mm(struct mm_struct *mm) +static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; + VMA_ITERATOR(vmi, mm, 0); /* * Tell all users of get_user/copy_from_user etc... that the content @@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - for (vma = mm->mmap ; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; @@ -764,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) mmgrab(tsk->signal->oom_mm); - set_bit(MMF_OOM_VICTIM, &mm->flags); - } /* * Make sure that the task is woken up from uninterruptible sleep diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 032a7bf8d259..ad608ef2a243 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -13,6 +13,7 @@ */ #include <linux/kernel.h> +#include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> @@ -197,7 +198,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, min *= this_bw; min = div64_ul(min, tot_bw); } - if (max < 100) { + if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } @@ -650,11 +651,49 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); + + return ratio; +} + +static u64 bdi_get_bytes(unsigned int ratio) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + +static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + min_ratio *= BDI_RATIO_SCALE; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; @@ -665,7 +704,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; - if (bdi_min_ratio + delta < 100) { + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { @@ -678,11 +717,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) return ret; } -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; - if (max_ratio > 100) + if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); @@ -696,8 +735,81 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) return ret; } + +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); +} EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + unsigned long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_min_ratio(bdi, min_ratio); +} + +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + unsigned long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -760,15 +872,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); - wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - wb_thresh += (thresh * wb_min_ratio) / 100; - if (wb_thresh > (thresh * wb_max_ratio) / 100) - wb_thresh = thresh * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) + wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); return wb_thresh; } @@ -1933,6 +2045,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, wb_put(wb); return ret; } +EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags); /** * balance_dirty_pages_ratelimited - balance dirty memory state. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5486d47406e..0745aedebb37 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kasan.h> +#include <linux/kmsan.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -169,21 +170,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock); _ret; \ }) -#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +#define pcpu_spin_trylock(type, member, ptr) \ ({ \ type *_ret; \ pcpu_task_pin(); \ _ret = this_cpu_ptr(ptr); \ - spin_lock_irqsave(&_ret->member, flags); \ - _ret; \ -}) - -#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ -({ \ - type *_ret; \ - pcpu_task_pin(); \ - _ret = this_cpu_ptr(ptr); \ - if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + if (!spin_trylock(&_ret->member)) { \ pcpu_task_unpin(); \ _ret = NULL; \ } \ @@ -196,27 +188,16 @@ static DEFINE_MUTEX(pcp_batch_high_lock); pcpu_task_unpin(); \ }) -#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ -({ \ - spin_unlock_irqrestore(&ptr->member, flags); \ - pcpu_task_unpin(); \ -}) - /* struct per_cpu_pages specific helpers. */ #define pcp_spin_lock(ptr) \ pcpu_spin_lock(struct per_cpu_pages, lock, ptr) -#define pcp_spin_lock_irqsave(ptr, flags) \ - pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) - -#define pcp_spin_trylock_irqsave(ptr, flags) \ - pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) +#define pcp_spin_trylock(ptr) \ + pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) #define pcp_spin_unlock(ptr) \ pcpu_spin_unlock(lock, ptr) -#define pcp_spin_unlock_irqrestore(ptr, flags) \ - pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -482,6 +463,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { static unsigned long prev_end_pfn, nr_initialised; + if (early_page_ext_enabled()) + return false; /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. @@ -542,7 +525,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); #else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); #endif /* CONFIG_SPARSEMEM */ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } @@ -795,6 +778,7 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); } @@ -804,6 +788,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) @@ -870,7 +855,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << order), migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } @@ -900,7 +886,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * order of appearance. So we need to first gather the full picture of what was * enabled, and then make decisions. */ -void init_mem_debugging_and_hardening(void) +void __init init_mem_debugging_and_hardening(void) { bool page_poisoning_requested = false; @@ -935,6 +921,10 @@ void init_mem_debugging_and_hardening(void) else static_branch_disable(&init_on_free); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -1105,7 +1095,7 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned long buddy_pfn; + unsigned long buddy_pfn = 0; unsigned long combined_pfn; struct page *buddy; bool to_tail; @@ -1283,20 +1273,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void check_free_page_bad(struct page *page) +static void free_page_is_bad_report(struct page *page) { bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int check_free_page(struct page *page) +static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) - return 0; + return false; /* Something has gone sideways, find it */ - check_free_page_bad(page); - return 1; + free_page_is_bad_report(page); + return true; } static int free_tail_pages_check(struct page *head_page, struct page *page) @@ -1315,11 +1305,19 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping may be compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { + /* the first tail page: these may be in place of ->mapping */ + if (unlikely(head_compound_mapcount(head_page))) { bad_page(page, "nonzero compound_mapcount"); goto out; } + if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { + bad_page(page, "nonzero subpages_mapcount"); + goto out; + } + if (unlikely(head_compound_pincount(head_page))) { + bad_page(page, "nonzero compound_pincount"); + goto out; + } break; case 2: /* @@ -1398,6 +1396,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + kmsan_free_page(page, order); if (unlikely(PageHWPoison(page)) && !order) { /* @@ -1421,14 +1420,12 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) { - ClearPageDoubleMap(page); + if (compound) ClearPageHasHWPoisoned(page); - } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(check_free_page(page + i))) { + if (unlikely(free_page_is_bad(page + i))) { bad++; continue; } @@ -1439,8 +1436,8 @@ static __always_inline bool free_pages_prepare(struct page *page, page->mapping = NULL; if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); - if (check_free) - bad += check_free_page(page); + if (check_free && free_page_is_bad(page)) + bad++; if (bad) return false; @@ -1499,10 +1496,11 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) return free_pages_prepare(page, order, true, FPI_NONE); } +/* return true if this page has an inappropriate state */ static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return check_free_page(page); + return free_page_is_bad(page); else return false; } @@ -1523,7 +1521,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { - return check_free_page(page); + return free_page_is_bad(page); } #endif /* CONFIG_DEBUG_VM */ @@ -1536,6 +1534,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { + unsigned long flags; int min_pindex = 0; int max_pindex = NR_PCP_LISTS - 1; unsigned int order; @@ -1551,8 +1550,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); while (count > 0) { @@ -1575,7 +1573,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; - BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { int mt; @@ -1601,7 +1598,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, } while (count > 0 && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -1705,6 +1702,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, if (!free_pages_prepare(page, order, true, fpi_flags)) return; + /* + * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here + * is used to avoid calling get_pfnblock_migratetype() under the lock. + * This will reduce the lock holding time. + */ migratetype = get_pfnblock_migratetype(page, pfn); spin_lock_irqsave(&zone->lock, flags); @@ -1804,6 +1806,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; + if (!kmsan_memblock_free_pages(page, order)) { + /* KMSAN will take care of these pages. */ + return; + } __free_pages_core(page, order); } @@ -1855,7 +1861,7 @@ void set_zone_contiguous(struct zone *zone) unsigned long block_start_pfn = zone->zone_start_pfn; unsigned long block_end_pfn; - block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(block_start_pfn); for (; block_start_pfn < zone_end_pfn(zone); block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1890,15 +1896,14 @@ static void __init deferred_free_range(unsigned long pfn, page = pfn_to_page(pfn); /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { + if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) + if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, 0); } @@ -1917,16 +1922,12 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. * - * First we check if pfn is valid on architectures where it is possible to have - * holes within pageblock_nr_pages. On systems where it is not possible, this - * function is optimized out. - * - * Then, we check if a current large page is valid by only checking the validity + * We check if a current large page is valid by only checking the validity * of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + if (pageblock_aligned(pfn) && !pfn_valid(pfn)) return false; return true; } @@ -1938,14 +1939,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn) static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; - } else if (!(pfn & nr_pgmask)) { + } else if (pageblock_aligned(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; } else { @@ -1965,7 +1965,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; int nid = zone_to_nid(zone); unsigned long nr_pages = 0; int zid = zone_idx(zone); @@ -1975,7 +1974,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, if (!deferred_pfn_valid(pfn)) { page = NULL; continue; - } else if (!page || !(pfn & nr_pgmask)) { + } else if (!page || pageblock_aligned(pfn)) { page = pfn_to_page(pfn); } else { page++; @@ -2651,8 +2650,8 @@ int move_freepages_block(struct zone *zone, struct page *page, *num_movable = 0; pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages - 1; + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) @@ -3010,7 +3009,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * i.e. orders < pageblock_order. If there are no local zones free, * the zonelists will be reiterated without ALLOC_NOFRAGMENT. */ - if (alloc_flags & ALLOC_NOFRAGMENT) + if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) min_order = pageblock_order; /* @@ -3118,10 +3117,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + unsigned long flags; int i, allocated = 0; - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -3155,7 +3154,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return allocated; } @@ -3172,16 +3171,9 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - unsigned long flags; - - /* - * free_pcppages_bulk expects IRQs disabled for zone->lock - * so even though pcp->lock is not intended to be IRQ-safe, - * it's needed in this context. - */ - spin_lock_irqsave(&pcp->lock, flags); + spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); - spin_unlock_irqrestore(&pcp->lock, flags); + spin_unlock(&pcp->lock); } } #endif @@ -3195,12 +3187,9 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) { - unsigned long flags; - - /* See drain_zone_pages on why this is disabling IRQs */ - spin_lock_irqsave(&pcp->lock, flags); + spin_lock(&pcp->lock); free_pcppages_bulk(zone, pcp->count, pcp, 0); - spin_unlock_irqrestore(&pcp->lock, flags); + spin_unlock(&pcp->lock); } } @@ -3440,7 +3429,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, int pindex; bool free_high; - __count_vm_event(PGFREE); + __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3466,7 +3455,6 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, */ void free_unref_page(struct page *page, unsigned int order) { - unsigned long flags; unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; struct zone *zone; @@ -3494,10 +3482,10 @@ void free_unref_page(struct page *page, unsigned int order) zone = page_zone(page); pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { free_unref_page_commit(zone, pcp, page, migratetype, order); - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); } @@ -3509,10 +3497,10 @@ void free_unref_page(struct page *page, unsigned int order) */ void free_unref_page_list(struct list_head *list) { + unsigned long __maybe_unused UP_flags; struct page *page, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - unsigned long flags; int batch_count = 0; int migratetype; @@ -3539,39 +3527,54 @@ void free_unref_page_list(struct list_head *list) list_for_each_entry_safe(page, next, list, lru) { struct zone *zone = page_zone(page); - /* Different zone, different pcp lock. */ - if (zone != locked_zone) { - if (pcp) - pcp_spin_unlock_irqrestore(pcp, flags); + list_del(&page->lru); + migratetype = get_pcppage_migratetype(page); + + /* + * Either different zone requiring a different pcp lock or + * excessive lock hold times when freeing a large list of + * pages. + */ + if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } + batch_count = 0; + + /* + * trylock is necessary as pages may be getting freed + * from IRQ or SoftIRQ context after an IO completion. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); + free_one_page(zone, page, page_to_pfn(page), + 0, migratetype, FPI_NONE); + locked_zone = NULL; + continue; + } locked_zone = zone; - pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. */ - migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); free_unref_page_commit(zone, pcp, page, migratetype, 0); - - /* - * Guard against excessive IRQ disabled times when we get - * a large list of pages to free. - */ - if (++batch_count == SWAP_CLUSTER_MAX) { - pcp_spin_unlock_irqrestore(pcp, flags); - batch_count = 0; - pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); - } + batch_count++; } - if (pcp) - pcp_spin_unlock_irqrestore(pcp, flags); + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } } /* @@ -3598,16 +3601,11 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { - unsigned long watermark; - struct zone *zone; - int mt; - - BUG_ON(!PageBuddy(page)); - - zone = page_zone(page); - mt = get_pageblock_migratetype(page); + struct zone *zone = page_zone(page); + int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { + unsigned long watermark; /* * Obey watermarks as if the page was being allocated. We can * emulate a high-order watermark check with a raised order-0 @@ -3621,8 +3619,6 @@ int __isolate_free_page(struct page *page, unsigned int order) __mod_zone_freepage_state(zone, -(1UL << order), mt); } - /* Remove page from free list */ - del_page_from_free_list(page, zone, order); /* @@ -3643,7 +3639,6 @@ int __isolate_free_page(struct page *page, unsigned int order) } } - return 1UL << order; } @@ -3670,8 +3665,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) /* * Update NUMA hit/miss statistics - * - * Must be called with interrupts disabled. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, long nr_account) @@ -3777,21 +3770,16 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long flags; unsigned long __maybe_unused UP_flags; - /* - * spin_trylock may fail due to a parallel drain. In the future, the - * trylock will also protect against IRQ reentrancy. - */ + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { pcp_trylock_finish(UP_flags); return NULL; @@ -3805,18 +3793,27 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); } return page; } /* - * Allocate a page from the given zone. Use pcplists for order-0 allocations. + * Allocate a page from the given zone. + * Use pcplists for THP or "cheap" high-order allocations. */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. + */ +__no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, @@ -3839,7 +3836,7 @@ struct page *rmqueue(struct zone *preferred_zone, if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + migratetype, alloc_flags); if (likely(page)) goto out; } @@ -3882,6 +3879,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc); static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { + int flags = 0; + if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) @@ -3892,10 +3891,11 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; + /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) - fail_page_alloc.attr.no_warn = true; + flags |= FAULT_NOWARN; - return should_fail(&fail_page_alloc.attr, 1 << order); + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -4329,7 +4329,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter, nodemask); + __show_mem(filter, nodemask, gfp_zone(gfp_mask)); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -4708,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask) EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. + */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + /* Perform direct synchronous page reclaim */ static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -5001,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; int reserve_flags; /* @@ -5011,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry_cpuset: +restart: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); /* * The fast path uses conservative alloc_flags to succeed only until @@ -5121,7 +5147,8 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | + (alloc_flags & ALLOC_KSWAPD); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -5187,9 +5214,13 @@ retry: goto retry; - /* Deal with possible cpuset update races before we start OOM killing */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -5209,9 +5240,13 @@ retry: } nopage: - /* Deal with possible cpuset update races before we fail */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure @@ -5238,7 +5273,7 @@ nopage: * so that we can identify them and convert them to something * else. */ - WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask); + WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* * Help non-failing allocations by giving them access to memory @@ -5329,7 +5364,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct page **page_array) { struct page *page; - unsigned long flags; unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; @@ -5411,9 +5445,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; - /* Is a parallel drain in progress? */ + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) goto failed_irq; @@ -5432,7 +5466,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); goto failed_irq; } break; @@ -5447,7 +5481,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); @@ -5535,6 +5569,7 @@ out: } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); return page; } @@ -5706,6 +5741,18 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } } nc->pagecnt_bias--; @@ -5732,14 +5779,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); + unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); + struct page *page = virt_to_page((void *)addr); + struct page *last = page + nr; - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); + + last = page + (1UL << order); + for (page += nr; page < last; page++) + __free_pages_ok(page, 0, FPI_TO_TAIL); } return (void *)addr; } @@ -6011,6 +6062,15 @@ static void show_migration_types(unsigned char type) printk(KERN_CONT "(%s) ", tmp); } +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -6020,7 +6080,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter, nodemask_t *nodemask) +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; @@ -6028,6 +6088,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) pg_data_t *pgdat; for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6039,7 +6101,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), @@ -6056,6 +6119,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), @@ -6065,6 +6129,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; printk("Node %d" " active_anon:%lukB" @@ -6089,6 +6155,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " shadow_call_stack:%lukB" #endif " pagetables:%lukB" + " sec_pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -6114,6 +6181,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -6121,6 +6189,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_populated_zone(zone) { int i; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6182,6 +6252,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); @@ -6507,16 +6579,15 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { int nid; int __maybe_unused cpu; pg_data_t *self = data; - static DEFINE_SPINLOCK(lock); - spin_lock(&lock); + write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -6553,7 +6624,7 @@ static void __build_all_zonelists(void *data) #endif } - spin_unlock(&lock); + write_sequnlock(&zonelist_update_seq); } static noinline void __init @@ -6704,7 +6775,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * such that unmovable allocations won't be scattered all * over the place during system boot. */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, migratetype); cond_resched(); } @@ -6747,10 +6818,18 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); } /* @@ -6786,13 +6865,11 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores important compound page info. + * Call prep_compound_head() after the first tail page has + * been initialized, to not have the data overwritten. */ - if (pfn == head_pfn + 2) + if (pfn == head_pfn + 1) prep_compound_head(head, order); } } @@ -6810,7 +6887,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) + if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) return; /* @@ -6879,9 +6956,8 @@ static void __init init_unavailable_range(unsigned long spfn, u64 pgcnt = 0; for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; + if (!pfn_valid(pageblock_start_pfn(pfn))) { + pfn = pageblock_end_pfn(pfn) - 1; continue; } __init_single_page(pfn_to_page(pfn), pfn, zone, node); @@ -6986,7 +7062,7 @@ static int zone_batchsize(struct zone *zone) * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); + batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -7171,6 +7247,17 @@ void __meminit setup_zone_pageset(struct zone *zone) } /* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated. + */ +static void zone_pcp_update(struct zone *zone, int cpu_online) +{ + mutex_lock(&pcp_batch_high_lock); + zone_set_pageset_high_and_batch(zone, cpu_online); + mutex_unlock(&pcp_batch_high_lock); +} + +/* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. */ @@ -7614,6 +7701,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) int i; pgdat_resize_init(pgdat); + pgdat_kswapd_lock_init(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); @@ -7908,17 +7996,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/** - * find_min_pfn_with_active_regions - Find the minimum PFN registered - * - * Return: the minimum PFN based on information provided via - * memblock_set_node(). - */ -unsigned long __init find_min_pfn_with_active_regions(void) -{ - return PHYS_PFN(memblock_start_of_DRAM()); -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. @@ -8211,7 +8288,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - start_pfn = find_min_pfn_with_active_regions(); + start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { @@ -8460,8 +8537,8 @@ void __init mem_init_print_info(void) #endif ")\n", K(nr_free_pages()), K(physpages), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, + codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K, + (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K, K(physpages - totalram_pages() - totalcma_pages), K(totalcma_pages) #ifdef CONFIG_HIGHMEM @@ -8974,7 +9051,7 @@ void *__init alloc_large_system_hash(const char *tablename, { unsigned long long max = high_limit; unsigned long log2qty, size; - void *table = NULL; + void *table; gfp_t gfp_flags; bool virt; bool huge; @@ -8986,8 +9063,8 @@ void *__init alloc_large_system_hash(const char *tablename, numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ - if (PAGE_SHIFT < 20) - numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + if (PAGE_SIZE < SZ_1M) + numentries = round_up(numentries, SZ_1M / PAGE_SIZE); #if __BITS_PER_LONG > 32 if (!high_limit) { @@ -9412,17 +9489,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) EXPORT_SYMBOL(free_contig_range); /* - * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalculated. - */ -void zone_pcp_update(struct zone *zone, int cpu_online) -{ - mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone, cpu_online); - mutex_unlock(&pcp_batch_high_lock); -} - -/* * Effectively disable pcplists for the zone by setting the high limit to 0 * and draining all cpus. A concurrent page freeing on another CPU that's about * to put the page on pcplist will either finish before the drain and the page @@ -9454,9 +9520,11 @@ void zone_pcp_reset(struct zone *zone) drain_zonestat(zone, pzstats); } free_percpu(zone->per_cpu_pageset); - free_percpu(zone->per_cpu_zonestats); zone->per_cpu_pageset = &boot_pageset; - zone->per_cpu_zonestats = &boot_zonestats; + if (zone->per_cpu_zonestats != &boot_zonestats) { + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_zonestats = &boot_zonestats; + } } } diff --git a/mm/page_counter.c b/mm/page_counter.c index eb156ff5d603..db20d6452b71 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; - unsigned long low, min; long delta; if (!c->parent) return; - min = READ_ONCE(c->min); - if (min || atomic_long_read(&c->min_usage)) { - protected = min(usage, min); + protected = min(usage, READ_ONCE(c->min)); + old_protected = atomic_long_read(&c->min_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } - low = READ_ONCE(c->low); - if (low || atomic_long_read(&c->low_usage)) { - protected = min(usage, low); + protected = min(usage, READ_ONCE(c->low)); + old_protected = atomic_long_read(&c->low_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) @@ -193,7 +192,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) old = xchg(&counter->max, nr_pages); - if (page_counter_read(counter) <= usage) + if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; diff --git a/mm/page_ext.c b/mm/page_ext.c index 3dc715d7ac29..4ee522fd381c 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -9,6 +9,7 @@ #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> +#include <linux/rcupdate.h> /* * struct page extension @@ -59,6 +60,10 @@ * can utilize this callback to initialize the state of it correctly. */ +#ifdef CONFIG_SPARSEMEM +#define PAGE_EXT_INVALID (0x1) +#endif + #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { @@ -84,6 +89,15 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { unsigned long page_ext_size = sizeof(struct page_ext); static unsigned long total_usage; +static struct page_ext *lookup_page_ext(const struct page *page); + +bool early_page_ext; +static int __init setup_early_page_ext(char *str) +{ + early_page_ext = true; + return 0; +} +early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { @@ -125,6 +139,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index) return base + page_ext_size * index; } +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. + * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext: Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} #ifndef CONFIG_SPARSEMEM @@ -133,12 +189,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; + WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a @@ -206,20 +263,27 @@ fail: } #else /* CONFIG_SPARSEMEM */ +static bool page_ext_invalid(struct page_ext *page_ext) +{ + return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); +} -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + struct page_ext *page_ext = READ_ONCE(section->page_ext); + + WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ - if (!section->page_ext) + if (page_ext_invalid(page_ext)) return NULL; - return get_entry(section->page_ext, pfn); + return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -298,9 +362,30 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = get_entry(ms->page_ext, pfn); + + base = READ_ONCE(ms->page_ext); + /* + * page_ext here can be valid while doing the roll back + * operation in online_page_ext(). + */ + if (page_ext_invalid(base)) + base = (void *)base - PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, NULL); + + base = get_entry(base, pfn); free_page_ext(base); - ms->page_ext = NULL; +} + +static void __invalidate_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + void *val; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + val = (void *)ms->page_ext + PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, @@ -336,13 +421,27 @@ static int __meminit online_page_ext(unsigned long start_pfn, } static int __meminit offline_page_ext(unsigned long start_pfn, - unsigned long nr_pages, int nid) + unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); + /* + * Freeing of page_ext is done in 3 steps to avoid + * use-after-free of it: + * 1) Traverse all the sections and mark their page_ext + * as invalid. + * 2) Wait for all the existing users of page_ext who + * started before invalidation to finish. + * 3) Free the page_ext. + */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return 0; @@ -362,11 +461,11 @@ static int __meminit page_ext_callback(struct notifier_block *self, break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + mn->nr_pages); break; case MEM_GOING_OFFLINE: break; @@ -414,7 +513,7 @@ void __init page_ext_init(void) cond_resched(); } } - hotplug_memory_notifier(page_ext_callback, 0); + hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; diff --git a/mm/page_io.c b/mm/page_io.c index 68318134dc92..3a5f921b932e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -28,7 +28,7 @@ #include <linux/delayacct.h> #include "swap.h" -void end_swap_bio_write(struct bio *bio) +static void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -180,29 +180,30 @@ bad_bmap: */ int swap_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret = 0; - if (try_to_free_swap(page)) { - unlock_page(page); + if (folio_free_swap(folio)) { + folio_unlock(folio); goto out; } /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ - ret = arch_prepare_to_swap(page); + ret = arch_prepare_to_swap(&folio->page); if (ret) { - set_page_dirty(page); - unlock_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); goto out; } - if (frontswap_store(page) == 0) { - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + if (frontswap_store(&folio->page) == 0) { + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); goto out; } - ret = __swap_writepage(page, wbc, end_swap_bio_write); + ret = __swap_writepage(&folio->page, wbc); out: return ret; } @@ -332,8 +333,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) return 0; } -int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func) +int __swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret; @@ -358,7 +358,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_end_io = end_write_func; + bio->bi_end_io = end_swap_bio_write; bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); @@ -376,7 +376,7 @@ void swap_write_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -453,18 +453,21 @@ int swap_readpage(struct page *page, bool synchronous, struct swap_info_struct *sis = page_swap_info(page); bool workingset = PageWorkingset(page); unsigned long pflags; + bool in_thrashing; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); /* - * Count submission time as memory stall. When the device is congested, - * or the submitting cgroup IO-throttled, submission can be a - * significant part of overall IO time. + * Count submission time as memory stall and delay. When the device + * is congested, or the submitting cgroup IO-throttled, submission + * can be a significant part of overall IO time. */ - if (workingset) + if (workingset) { + delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); + } delayacct_swapin_start(); if (frontswap_load(page) == 0) { @@ -513,8 +516,10 @@ int swap_readpage(struct page *page, bool synchronous, bio_put(bio); out: - if (workingset) + if (workingset) { + delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); + } delayacct_swapin_end(); return ret; } @@ -525,7 +530,7 @@ void __swap_read_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d73dc38e3d7..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -37,8 +37,8 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e struct zone *zone = page_zone(page); unsigned long pfn; - VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) != - ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages)); + VM_BUG_ON(pageblock_start_pfn(start_pfn) != + pageblock_start_pfn(end_pfn - 1)); if (is_migrate_cma_page(page)) { /* @@ -172,7 +172,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * to avoid redundant checks. */ check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages), + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), end_pfn); unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, @@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() + * @migratetype: migrate type to set in error recovery. * * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same @@ -302,16 +303,16 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation) + gfp_t gfp_flags, bool isolate_before, bool skip_isolation, + int migratetype) { - unsigned char saved_mt; unsigned long start_pfn; unsigned long isolate_pageblock; unsigned long pfn; struct zone *zone; int ret; - VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); + VM_BUG_ON(!pageblock_aligned(boundary_pfn)); if (isolate_before) isolate_pageblock = boundary_pfn - pageblock_nr_pages; @@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES), zone->zone_start_pfn); - saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + if (skip_isolation) { + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - if (skip_isolation) - VM_BUG_ON(!is_migrate_isolate(saved_mt)); - else { - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, - isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + VM_BUG_ON(!is_migrate_isolate(mt)); + } else { + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype, + flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages); if (ret) return ret; @@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, failed: /* restore the original migratetype */ if (!skip_isolation) - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype); return -EBUSY; } @@ -531,13 +532,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; /* isolation is done at page block granularity */ - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); + unsigned long isolate_end = pageblock_align(end_pfn); int ret; bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation); + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + skip_isolation, migratetype); if (ret) return ret; @@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation); + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; @@ -576,9 +579,8 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); - + unsigned long isolate_start = pageblock_start_pfn(start_pfn); + unsigned long isolate_end = pageblock_align(end_pfn); for (pfn = isolate_start; pfn < isolate_end; diff --git a/mm/page_owner.c b/mm/page_owner.c index e4c6f3f1695b..2d27f532df4c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order) struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; depot_stack_handle_t handle; + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - - handle = save_stack(gfp_mask); __set_page_owner_handle(page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -219,17 +223,25 @@ void __split_page_owner(struct page *page, unsigned int nr) page_owner->order = 0; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - struct page_ext *old_ext = lookup_page_ext(&old->page); - struct page_ext *new_ext = lookup_page_ext(&newfolio->page); + struct page_ext *old_ext; + struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; - if (unlikely(!old_ext || !new_ext)) + old_ext = page_ext_get(&old->page); + if (unlikely(!old_ext)) return; + new_ext = page_ext_get(&newfolio->page); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); + return; + } + old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); new_page_owner->order = old_page_owner->order; @@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -283,7 +297,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); @@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); @@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, count[pageblock_mt]++; pfn = block_end_pfn; + page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); } } @@ -435,7 +452,7 @@ err: void __dump_page_owner(const struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; @@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); return; } @@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page) if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); } static ssize_t @@ -497,17 +516,25 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) return -EINVAL; page = NULL; - pfn = min_low_pfn + *ppos; - + if (*ppos == 0) + pfn = min_low_pfn; + else + pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; - drain_all_pages(NULL); - /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* + * This temporary page_owner is required so + * that we can avoid the context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + + /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not */ @@ -525,7 +552,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; @@ -534,14 +561,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); @@ -550,7 +577,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) - continue; + goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should @@ -558,18 +585,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) */ handle = READ_ONCE(page_owner->handle); if (!handle) - continue; + goto ext_put_continue; /* Record the next PFN to read in the file offset */ - *ppos = (pfn - min_low_pfn) + 1; + *ppos = pfn + 1; + page_owner_tmp = *page_owner; + page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, - page_owner, handle); + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); } return 0; } +static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case SEEK_SET: + file->f_pos = offset; + break; + case SEEK_CUR: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + return file->f_pos; +} + static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; @@ -589,7 +635,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { @@ -617,18 +663,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; +ext_put_continue: + page_ext_put(page_ext); } cond_resched(); } @@ -660,6 +708,7 @@ static void init_early_allocated_pages(void) static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, + .llseek = lseek_page_owner, }; static int __init pageowner_init(void) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 382958eef8a9..79a8554f024c 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -11,10 +11,42 @@ #include "page_reporting.h" #include "internal.h" -unsigned int page_reporting_order = MAX_ORDER; -module_param(page_reporting_order, uint, 0644); +/* Initialize to an unsupported value */ +unsigned int page_reporting_order = -1; + +static int page_order_update_notify(const char *val, const struct kernel_param *kp) +{ + /* + * If param is set beyond this limit, order is set to default + * pageblock_order value + */ + return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1); +} + +static const struct kernel_param_ops page_reporting_param_ops = { + .set = &page_order_update_notify, + /* + * For the get op, use param_get_int instead of param_get_uint. + * This is to make sure that when unset the initialized value of + * -1 is shown correctly + */ + .get = ¶m_get_int, +}; + +module_param_cb(page_reporting_order, &page_reporting_param_ops, + &page_reporting_order, 0644); MODULE_PARM_DESC(page_reporting_order, "Set page reporting order"); +/* + * This symbol is also a kernel parameter. Export the page_reporting_order + * symbol so that other drivers can access it to control order values without + * having to introduce another configurable parameter. Only one driver can + * register with the page_reporting driver for the service, so we have just + * one control parameter for the use case(which can be accessed in both + * drivers) + */ +EXPORT_SYMBOL_GPL(page_reporting_order); + #define PAGE_REPORTING_DELAY (2 * HZ) static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; @@ -330,10 +362,18 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) } /* - * Update the page reporting order if it's specified by driver. - * Otherwise, it falls back to @pageblock_order. + * If the page_reporting_order value is not set, we check if + * an order is provided from the driver that is performing the + * registration. If that is not provided either, we default to + * pageblock_order. */ - page_reporting_order = prdev->order ? : pageblock_order; + + if (page_reporting_order == -1) { + if (prdev->order > 0 && prdev->order <= MAX_ORDER) + page_reporting_order = prdev->order; + else + page_reporting_order = pageblock_order; + } /* initialize state and work structures */ atomic_set(&prdev->state, PAGE_REPORTING_IDLE); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index e2062748791a..93e633c1d587 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -4,6 +4,7 @@ * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> @@ -23,7 +24,7 @@ EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { - return strtobool(buf, &__page_table_check_enabled); + return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); @@ -53,7 +54,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) } /* - * An enty is removed from the page table, decrement the counters for that page + * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, @@ -68,7 +69,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -83,10 +84,11 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* - * A new enty is added to the page table, increment the counters for that page + * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ @@ -103,7 +105,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -118,6 +120,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -126,9 +129,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, */ void __page_table_check_zero(struct page *page, unsigned int order) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; unsigned long i; + page_ext = page_ext_get(page); BUG_ON(!page_ext); for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); @@ -137,6 +141,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) BUG_ON(atomic_read(&ptc->file_map_count)); page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8e9e574d535a..93e13fc17d3c 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -86,7 +86,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) !is_device_exclusive_entry(entry)) return false; - pfn = swp_offset(entry); + pfn = swp_offset_pfn(entry); } else if (is_swap_pte(*pvmw->pte)) { swp_entry_t entry; @@ -96,7 +96,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) !is_device_exclusive_entry(entry)) return false; - pfn = swp_offset(entry); + pfn = swp_offset_pfn(entry); } else { if (!pte_present(*pvmw->pte)) return false; @@ -221,7 +221,7 @@ restart: return not_found(pvmw); entry = pmd_to_swp_entry(pmde); if (!is_migration_entry(entry) || - !check_pmd(swp_offset(entry), pvmw)) + !check_pmd(swp_offset_pfn(entry), pvmw)) return not_found(pvmw); return true; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index fa7a3d21a751..7f1c9b274906 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -460,7 +460,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { @@ -482,7 +482,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } -/* +/** + * walk_page_range_novma - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for @@ -509,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, return walk_pgd_range(start, end, &walk); } +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; + + if (start >= end || !walk.mm) + return -EINVAL; + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + mmap_assert_locked(walk.mm); + return __walk_page_range(start, end, &walk); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { @@ -518,18 +546,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, .vma = vma, .private = private, }; - int err; if (!walk.mm) return -EINVAL; mmap_assert_locked(walk.mm); - - err = walk_page_test(vma->vm_start, vma->vm_end, &walk); - if (err > 0) - return 0; - if (err < 0) - return err; return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } diff --git a/mm/percpu.c b/mm/percpu.c index 27697b2429c2..acd78da0493b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -72,7 +72,6 @@ #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> -#include <linux/lcm.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> @@ -174,9 +173,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ -/* chunks which need their map areas extended, protected by pcpu_lock */ -static LIST_HEAD(pcpu_map_extend_chunks); - /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. @@ -834,13 +830,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, /* * Update s_block. - * block->first_free must be updated if the allocation takes its place. - * If the allocation breaks the contig_hint, a scan is required to - * restore this hint. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; + /* + * block->first_free must be updated if the allocation takes its place. + * If the allocation breaks the contig_hint, a scan is required to + * restore this hint. + */ if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), @@ -915,6 +913,12 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, } } + /* + * If the allocation is not atomic, some blocks may not be + * populated with pages, while we account it here. The number + * of pages will be added back with pcpu_chunk_populated() + * when populating pages. + */ if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); @@ -1342,7 +1346,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; - unsigned long aligned_addr, lcm_align; + unsigned long aligned_addr; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; @@ -1350,14 +1354,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; - - /* - * Align the end of the region with the LCM of PAGE_SIZE and - * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of - * the other. - */ - lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); - region_size = ALIGN(start_offset + map_size, lcm_align); + region_size = ALIGN(start_offset + map_size, PAGE_SIZE); /* allocate chunk */ alloc_size = struct_size(chunk, populated, @@ -1820,16 +1817,12 @@ restart: spin_unlock_irqrestore(&pcpu_lock, flags); - /* - * No space left. Create a new chunk. We don't want multiple - * tasks to create chunks simultaneously. Serialize and create iff - * there's still no empty chunk after grabbing the mutex. - */ if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } + /* No space left. Create a new chunk. */ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { @@ -2146,9 +2139,9 @@ static void pcpu_reclaim_populated(void) * other accessor is the free path which only returns area back to the * allocator not touching the populated bitmap. */ - while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) { - chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot], - struct pcpu_chunk, list); + while ((chunk = list_first_entry_or_null( + &pcpu_chunk_lists[pcpu_to_depopulate_slot], + struct pcpu_chunk, list))) { WARN_ON(chunk->immutable); /* @@ -2166,7 +2159,7 @@ static void pcpu_reclaim_populated(void) /* reintegrate chunk to prevent atomic alloc failures */ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; - goto end_chunk; + break; } /* @@ -2202,7 +2195,6 @@ static void pcpu_reclaim_populated(void) end = -1; } -end_chunk: /* batch tlb flush per chunk to amortize cost */ if (freed_page_start < freed_page_end) { spin_unlock_irq(&pcpu_lock); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 4bcc11958089..78dfaf9e8990 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -263,7 +263,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec *iov_r; struct iov_iter iter; ssize_t rc; - int dir = vm_write ? WRITE : READ; + int dir = vm_write ? ITER_SOURCE : ITER_DEST; if (flags != 0) return -EINVAL; diff --git a/mm/readahead.c b/mm/readahead.c index fdcd28cbd92d..b10f0cf81d80 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -122,6 +122,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> #include <linux/pagemap.h> +#include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> @@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { @@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac) } blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; BUG_ON(readahead_count(rac)); } @@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); + ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } @@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); - if (err) + if (err) { folio_put(folio); - else - ractl->_nr_pages += 1UL << order; - return err; + return err; + } + + ractl->_nr_pages += 1UL << order; + ractl->_workingset |= folio_test_workingset(folio); + return 0; } void page_cache_ra_order(struct readahead_control *ractl, @@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl, put_page(page); return; } + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; if (ra) { ra->size++; diff --git a/mm/rmap.c b/mm/rmap.c index 93d5a6f793d2..b616870a09be 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,10 +23,9 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) * (see hugetlbfs below) - * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * page->flags PG_locked (lock_page) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) @@ -46,10 +45,11 @@ * ->tasklist_lock * pte map lock * - * * hugetlbfs PageHuge() pages take locks in this order: - * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) - * page->flags PG_locked (lock_page) + * hugetlbfs PageHuge() take locks in this order: + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * vma_lock (hugetlb specific lock for pmd_sharing) + * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) + * page->flags PG_locked (lock_page) */ #include <linux/mm.h> @@ -315,8 +315,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) enomem_failure: /* - * dst->anon_vma is dropped here otherwise its degree can be incorrectly - * decremented in unlink_anon_vmas(). + * dst->anon_vma is dropped here otherwise its num_active_vmas can + * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ @@ -489,16 +489,16 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. */ -struct anon_vma *page_get_anon_vma(struct page *page) +struct anon_vma *folio_get_anon_vma(struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long)READ_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; - if (!page_mapped(page)) + if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); @@ -508,13 +508,13 @@ struct anon_vma *page_get_anon_vma(struct page *page) } /* - * If this page is still mapped, then its anon_vma cannot have been + * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; @@ -526,11 +526,11 @@ out: } /* - * Similar to page_get_anon_vma() except it locks the anon_vma. + * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex + * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, @@ -602,11 +602,6 @@ out: return anon_vma; } -void page_unlock_anon_vma_read(struct anon_vma *anon_vma) -{ - anon_vma_unlock_read(anon_vma); -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is @@ -770,13 +765,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return vma_address(page, vma); } +/* + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* + * represents. + */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; - pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -791,15 +790,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - /* - * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() - * without holding anon_vma lock for write. So when looking for a - * genuine pmde (in which to find pte), test present and !THP together. - */ - pmde = *pmd; - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - pmd = NULL; out: return pmd; } @@ -833,6 +823,12 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { + if (lru_gen_enabled() && pte_young(*pvmw.pte) && + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + lru_gen_look_around(&pvmw); + referenced++; + } + if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { /* @@ -1089,6 +1085,29 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } +int total_compound_mapcount(struct page *head) +{ + int mapcount = head_compound_mapcount(head); + int nr_subpages; + int i; + + /* In the common case, avoid the loop when no subpages mapped by PTE */ + if (head_subpages_mapcount(head) == 0) + return mapcount; + /* + * Add all the PTE mappings of those subpages mapped by PTE. + * Limit the loop, knowing that only subpages_mapcount are mapped? + * Perhaps: given all the raciness, that may be a good or a bad idea. + */ + nr_subpages = thp_nr_pages(head); + for (i = 0; i < nr_subpages; i++) + mapcount += atomic_read(&head[i]._mapcount); + + /* But each of those _mapcounts was based on -1 */ + mapcount += nr_subpages; + return mapcount; +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1101,22 +1120,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, */ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; - struct page *subpage = page; - - page = compound_head(page); + void *anon_vma = vma->anon_vma; + struct folio *folio = page_folio(page); - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma += PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - SetPageAnonExclusive(subpage); + WRITE_ONCE(folio->mapping, anon_vma); + SetPageAnonExclusive(page); } /** @@ -1200,38 +1217,50 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + atomic_t *mapped; + int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; - bool first; + bool first = true; if (unlikely(PageKsm(page))) lock_page_memcg(page); - else - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound) { - atomic_t *mapcount; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - mapcount = compound_mapcount_ptr(page); - first = atomic_inc_and_test(mapcount); - } else { + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); + nr = first; + if (first && PageCompound(page)) { + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = (nr < COMPOUND_MAPPED); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ + + first = atomic_inc_and_test(compound_mapcount_ptr(page)); + if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); + if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of a remove and another add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* Raced ahead of a remove of COMPOUND_MAPPED */ + nr = 0; + } + } } + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (first) { - int nr = compound ? thp_nr_pages(page) : 1; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption - * disabled. - */ - if (compound) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + if (nr_pmdmapped) + __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - } if (unlikely(PageKsm(page))) unlock_page_memcg(page); @@ -1262,22 +1291,24 @@ void page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - const bool compound = PageCompound(page); - int nr = compound ? thp_nr_pages(page) : 1; + int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); __SetPageSwapBacked(page); - if (compound) { + + if (likely(!PageCompound(page))) { + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + nr = 1; + } else { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); - + atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); + nr = thp_nr_pages(page); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); - } else { - /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); } + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1293,45 +1324,45 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int i, nr = 0; + atomic_t *mapped; + int nr = 0, nr_pmdmapped = 0; + bool first; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); - for (i = 0; i < nr_pages; i++) { - if (atomic_inc_and_test(&page[i]._mapcount)) - nr++; + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { + first = atomic_inc_and_test(&page->_mapcount); + nr = first; + if (first && PageCompound(page)) { + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = (nr < COMPOUND_MAPPED); } - if (!atomic_inc_and_test(compound_mapcount_ptr(page))) - goto out; - - /* - * It is racy to ClearPageDoubleMap in page_remove_file_rmap(); - * but page lock is held by all page_add_file_rmap() compound - * callers, and SetPageDoubleMap below warns if !PageLocked: - * so here is a place that DoubleMap can be safely cleared. - */ - VM_WARN_ON_ONCE(!PageLocked(page)); - if (nr == nr_pages && PageDoubleMap(page)) - ClearPageDoubleMap(page); - - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - nr_pages); - } else { - if (PageTransCompound(page) && page_mapping(page)) { - VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ + + first = atomic_inc_and_test(compound_mapcount_ptr(page)); + if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); + if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of a remove and another add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* Raced ahead of a remove of COMPOUND_MAPPED */ + nr = 0; + } } - if (atomic_inc_and_test(&page->_mapcount)) - nr++; } -out: + + if (nr_pmdmapped) + __mod_lruvec_page_state(page, PageSwapBacked(page) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); unlock_page_memcg(page); @@ -1339,132 +1370,87 @@ out: mlock_vma_page(page, vma, compound); } -static void page_remove_file_rmap(struct page *page, bool compound) +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed + * @compound: uncharge the page as compound or small page + * + * The caller needs to hold the pte lock. + */ +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { - int i, nr = 0; + atomic_t *mapped; + int nr = 0, nr_pmdmapped = 0; + bool last; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + /* Hugetlb pages are not counted in NR_*MAPPED */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); return; } - /* page still mapped by someone else? */ - if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); + lock_page_memcg(page); - for (i = 0; i < nr_pages; i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; + /* Is page being unmapped by PTE? Is this its last map to be removed? */ + if (likely(!compound)) { + last = atomic_add_negative(-1, &page->_mapcount); + nr = last; + if (last && PageCompound(page)) { + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_dec_return_relaxed(mapped); + nr = (nr < COMPOUND_MAPPED); } - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - -nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - -nr_pages); - } else { - if (atomic_add_negative(-1, &page->_mapcount)) - nr++; - } -out: - if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); -} - -static void page_remove_anon_compound_rmap(struct page *page) -{ - int i, nr; - - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; - - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - return; - - __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); - - if (TestClearPageDoubleMap(page)) { - /* - * Subpages can be mapped with PTEs too. Check how many of - * them are still mapped. - */ - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ + + last = atomic_add_negative(-1, compound_mapcount_ptr(page)); + if (last) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); + if (likely(nr < COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of another remove and an add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* An add of COMPOUND_MAPPED raced ahead */ + nr = 0; + } } + } + if (nr_pmdmapped) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : + (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : + NR_FILE_PMDMAPPED), -nr_pmdmapped); + } + if (nr) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : + NR_FILE_MAPPED, -nr); /* - * Queue the page for deferred split if at least one small + * Queue anon THP for deferred split if at least one small * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < thp_nr_pages(page)) - deferred_split_huge_page(page); - } else { - nr = thp_nr_pages(page); - } - - if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); -} - -/** - * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from - * @vma: the vm area from which the mapping is removed - * @compound: uncharge the page as compound or small page - * - * The caller needs to hold the pte lock. - */ -void page_remove_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - lock_page_memcg(page); - - if (!PageAnon(page)) { - page_remove_file_rmap(page, compound); - goto out; + if (PageTransCompound(page) && PageAnon(page)) + if (!compound || nr < nr_pmdmapped) + deferred_split_huge_page(compound_head(page)); } - if (compound) { - page_remove_anon_compound_rmap(page); - goto out; - } - - /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; - - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption disabled. - */ - __dec_lruvec_page_state(page, NR_ANON_MAPPED); - - if (PageTransCompound(page)) - deferred_split_huge_page(compound_head(page)); - /* - * It would be tidy to reset the PageAnon mapping here, + * It would be tidy to reset PageAnon mapping when fully unmapped, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_unref_page, + * before us: so leave the reset to free_pages_prepare, * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. */ -out: + unlock_page_memcg(page); munlock_vma_page(page, vma, compound); @@ -1560,33 +1546,45 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and fail + * if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); - /* - * Nuke the page table entry. When having to clear - * PageAnonExclusive(), we always have to flush. - */ - if (should_defer_flush(mm, flags) && !anon_exclusive) { + /* Nuke the page table entry. */ + if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. @@ -1717,6 +1715,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } + + /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(subpage)) { swap_free(entry); @@ -1793,7 +1793,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_not_mapped(struct folio *folio) +static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } @@ -1814,7 +1814,7 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; @@ -1936,26 +1936,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and + * fail if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } - /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { @@ -2048,6 +2063,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, } VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && !anon_exclusive, subpage); + + /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(subpage)) { if (folio_test_hugetlb(folio)) @@ -2073,7 +2090,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, else entry = make_readable_migration_entry( page_to_pfn(subpage)); - + if (pte_young(pteval)) + entry = make_migration_entry_young(entry); + if (pte_dirty(pteval)) + entry = make_migration_entry_dirty(entry); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); @@ -2122,7 +2142,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; @@ -2269,7 +2289,7 @@ static bool folio_make_device_exclusive(struct folio *folio, }; struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; @@ -2541,9 +2561,9 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); - + ClearHPageRestoreReserve(page); __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 2613371945b7..6d783436951f 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,13 +9,13 @@ #include <linux/rodata_test.h> #include <linux/uaccess.h> +#include <linux/mm.h> #include <asm/sections.h> static const int rodata_test_data = 0xC3; void rodata_test(void) { - unsigned long start, end; int zero = 0; /* test 1: read the value */ @@ -39,13 +39,11 @@ void rodata_test(void) } /* test 4: check if the rodata section is PAGE_SIZE aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__start_rodata)) { pr_err("start of .rodata is not page size aligned\n"); return; } - if (end & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__end_rodata)) { pr_err("end of .rodata is not page size aligned\n"); return; } diff --git a/mm/secretmem.c b/mm/secretmem.c index e3e9590c6fb3..04c3ac9448a1 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -276,20 +276,18 @@ static struct file_system_type secretmem_fs = { .kill_sb = kill_anon_super, }; -static int secretmem_init(void) +static int __init secretmem_init(void) { - int ret = 0; - if (!secretmem_enable) - return ret; + return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) - ret = PTR_ERR(secretmem_mnt); + return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; - return ret; + return 0; } fs_initcall(secretmem_init); diff --git a/mm/shmem.c b/mm/shmem.c index 42e5888bf84d..c301487be5fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> +#include <linux/iversion.h> #include "swap.h" static struct vfsmount *shm_mnt; @@ -139,17 +140,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type); - -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -190,7 +180,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_block(unsigned long flags, long pages) @@ -247,11 +237,17 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_anon_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_anon_vm_ops; +} + bool vma_is_shmem(struct vm_area_struct *vma) { - return vma->vm_ops == &shmem_vm_ops; + return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); @@ -472,20 +468,22 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; + if (shmem_huge_force) + return true; if (shmem_huge == SHMEM_HUGE_FORCE) return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: @@ -629,7 +627,7 @@ next: goto move_back; } - ret = split_huge_page(&folio->page); + ret = split_folio(folio); folio_unlock(folio); folio_put(folio); @@ -680,8 +678,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { return false; } @@ -763,23 +761,22 @@ error: } /* - * Like delete_from_page_cache, but substitutes swap for page. + * Like delete_from_page_cache, but substitutes swap for @folio. */ -static void shmem_delete_from_page_cache(struct page *page, void *radswap) +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); int error; - VM_BUG_ON_PAGE(PageCompound(page), page); - xa_lock_irq(&mapping->i_pages); - error = shmem_replace_entry(mapping, page->index, page, radswap); - page->mapping = NULL; - mapping->nrpages--; - __dec_lruvec_page_state(page, NR_FILE_PAGES); - __dec_lruvec_page_state(page, NR_SHMEM); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - put_page(page); + folio_put(folio); BUG_ON(error); } @@ -886,10 +883,9 @@ void shmem_unlock_mapping(struct address_space *mapping) static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; - struct page *page; /* - * At first avoid shmem_getpage(,,,SGP_READ): that fails + * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated pages as holes. */ folio = __filemap_get_folio(inode->i_mapping, index, @@ -900,9 +896,9 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * But read a page back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ - page = NULL; - shmem_getpage(inode, index, &page, SGP_READ); - return page ? page_folio(page) : NULL; + folio = NULL; + shmem_get_folio(inode, index, &folio, SGP_READ); + return folio; } /* @@ -932,21 +928,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_init(&fbatch); index = start; - while (index < end && find_lock_entries(mapping, index, end - 1, + while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; - index = indices[i]; - if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, folio); + indices[i], folio); continue; } - index += folio_nr_pages(folio) - 1; if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); @@ -955,9 +948,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } + /* + * When undoing a failed fallocate, we want none of the partial folio + * zeroing and splitting below, but shall want to truncate the whole + * folio when !uptodate indicates that it was added by this fallocate, + * even when [lstart, lend] covers only a part of the folio. + */ + if (unfalloc) + goto whole_folios; + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { @@ -983,11 +984,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_put(folio); } +whole_folios: + index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &fbatch, + if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -999,13 +1002,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; - index = indices[i]; if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, folio)) { + if (shmem_free_swap(mapping, indices[i], folio)) { /* Swap was replaced by page: retry */ - index--; + index = indices[i]; break; } nr_swaps_freed++; @@ -1018,19 +1020,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); - index--; + index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); truncate_inode_folio(mapping, folio); } - index = folio->index + folio_nr_pages(folio) - 1; folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); - index++; } spin_lock_irq(&info->lock); @@ -1043,6 +1043,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1069,7 +1070,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); - if (shmem_is_huge(NULL, inode, 0)) + if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1087,6 +1088,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; + bool update_mtime = false; + bool update_ctime = true; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) @@ -1107,7 +1110,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = current_time(inode); + update_mtime = true; + } else { + update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -1126,7 +1131,13 @@ static int shmem_setattr(struct user_namespace *mnt_userns, setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } return error; } @@ -1328,17 +1339,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, * and its shmem_writeback() needs them to be split when swapping. */ - if (PageTransCompound(page)) { + if (folio_test_large(folio)) { /* Ensure the subpages are still dirty */ - SetPageDirty(page); + folio_test_set_dirty(folio); if (split_huge_page(page) < 0) goto redirty; - ClearPageDirty(page); + folio = page_folio(page); + folio_clear_dirty(folio); } - BUG_ON(!PageLocked(page)); - mapping = page->mapping; - index = page->index; + BUG_ON(!folio_test_locked(folio)); + mapping = folio->mapping; + index = folio->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) @@ -1361,15 +1373,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a - * fallocated page arriving here is now to initialize it and write it. + * fallocated folio arriving here is now to initialize it and write it. * - * That's okay for a page already fallocated earlier, but if we have + * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track - * of this page in case we have to undo it, and (b) it may not be a + * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So - * reactivate the page, and let shmem_fallocate() quit when too many. + * reactivate the folio, and let shmem_fallocate() quit when too many. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); @@ -1385,9 +1397,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (shmem_falloc) goto redirty; } - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } swap = folio_alloc_swap(folio); @@ -1396,7 +1408,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the page is + * if it's not already there. Do it now before the folio is * moved to swap cache, when its pagelock no longer protects * the inode from eviction. But don't unlock the mutex until * we've incremented swapped, because shmem_unuse_inode() will @@ -1406,7 +1418,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, + if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); @@ -1415,21 +1427,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); - BUG_ON(page_mapped(page)); - swap_writepage(page, wbc); + BUG_ON(folio_mapped(folio)); + swap_writepage(&folio->page, wbc); return 0; } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(page, swap); + put_swap_folio(folio, swap); redirty: - set_page_dirty(page); + folio_mark_dirty(folio); if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ - unlock_page(page); + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ + folio_unlock(folio); return 0; } @@ -1486,7 +1498,7 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) mpol_cond_put(vma->vm_policy); } -static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; @@ -1499,7 +1511,9 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); - return page; + if (!page) + return NULL; + return page_folio(page); } /* @@ -1560,12 +1574,6 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) { @@ -1599,7 +1607,7 @@ failed: /* * When a page is moved from swapcache to shmem filecache (either by the - * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), @@ -1614,54 +1622,52 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) return folio_zonenum(folio) > gfp_zone(gfp); } -static int shmem_replace_page(struct page **pagep, gfp_t gfp, +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct page *oldpage, *newpage; struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; int error; - oldpage = *pagep; - entry.val = page_private(oldpage); + old = *foliop; + entry = folio_swap_entry(old); swap_index = swp_offset(entry); - swap_mapping = page_mapping(oldpage); + swap_mapping = swap_address_space(entry); /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); - if (!newpage) + VM_BUG_ON_FOLIO(folio_test_large(old), old); + new = shmem_alloc_folio(gfp, info, index); + if (!new) return -ENOMEM; - get_page(newpage); - copy_highpage(newpage, oldpage); - flush_dcache_page(newpage); + folio_get(new); + folio_copy(new, old); + flush_dcache_folio(new); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); - SetPageUptodate(newpage); - set_page_private(newpage, entry.val); - SetPageSwapCache(newpage); + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + folio_set_swap_entry(new, entry); + folio_set_swapcache(new); /* * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); + error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - old = page_folio(oldpage); - new = page_folio(newpage); mem_cgroup_migrate(old, new); - __inc_lruvec_page_state(newpage, NR_FILE_PAGES); - __inc_lruvec_page_state(newpage, NR_SHMEM); - __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); - __dec_lruvec_page_state(oldpage, NR_SHMEM); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); + __lruvec_stat_mod_folio(new, NR_SHMEM, 1); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); + __lruvec_stat_mod_folio(old, NR_SHMEM, -1); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1671,18 +1677,17 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * both PageSwapCache and page_private after getting page lock; * but be defensive. Reverse old to newpage for clear and free. */ - oldpage = newpage; + old = new; } else { - lru_cache_add(newpage); - *pagep = newpage; + folio_add_lru(new); + *foliop = new; } - ClearPageSwapCache(oldpage); - set_page_private(oldpage, 0); + folio_clear_swapcache(old); + old->private = NULL; - unlock_page(oldpage); - put_page(oldpage); - put_page(oldpage); + folio_unlock(old); + folio_put_refs(old, 2); return error; } @@ -1694,7 +1699,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swp_entry_t swapin_error; void *old; - swapin_error = make_swapin_error_entry(&folio->page); + swapin_error = make_swapin_error_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); @@ -1730,7 +1735,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct page *page; struct folio *folio = NULL; swp_entry_t swap; int error; @@ -1743,8 +1747,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EIO; /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { + folio = swap_cache_get_folio(swap, NULL, 0); + if (!folio) { /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -1752,13 +1756,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { + folio = shmem_swapin(swap, gfp, info, index); + if (!folio) { error = -ENOMEM; goto failed; } } - folio = page_folio(page); /* We have to do this with folio locked to prevent races */ folio_lock(folio); @@ -1781,8 +1784,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(swap, folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - folio = page_folio(page); + error = shmem_replace_folio(&folio, gfp, info, index); if (error) goto failed; } @@ -1822,7 +1824,7 @@ unlock: } /* - * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap @@ -1831,17 +1833,17 @@ unlock: * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct folio *folio; - pgoff_t hindex = index; + pgoff_t hindex; gfp_t huge_gfp; int error; int once = 0; @@ -1874,17 +1876,16 @@ repeat: if (error == -EEXIST) goto repeat; - *pagep = &folio->page; + *foliop = folio; return error; } if (folio) { - hindex = folio->index; if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; - /* fallocated page */ + /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); @@ -1892,10 +1893,10 @@ repeat: } /* - * SGP_READ: succeed on hole, with NULL page, letting caller zero. - * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ - *pagep = NULL; + *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) @@ -1910,7 +1911,7 @@ repeat: return 0; } - if (!shmem_is_huge(vma, inode, index)) + if (!shmem_is_huge(vma, inode, index, false)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); @@ -1928,7 +1929,7 @@ alloc_nohuge: if (error != -ENOSPC) goto unlock; /* - * Try to reclaim some space by splitting a huge page + * Try to reclaim some space by splitting a large folio * beyond i_size on the filesystem. */ while (retry--) { @@ -1964,9 +1965,9 @@ alloc_nohuge: if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { + folio_next_index(folio) - 1) { /* - * Part of the huge page is beyond i_size: subject + * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); @@ -1983,14 +1984,14 @@ alloc_nohuge: } /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { @@ -2016,7 +2017,7 @@ clear: goto unlock; } out: - *pagep = folio_page(folio, index - hindex); + *foliop = folio; return 0; /* @@ -2046,6 +2047,13 @@ unlock: return error; } +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the @@ -2063,6 +2071,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2125,10 +2134,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); + if (folio) + vmf->page = folio_file_page(folio, vmf->pgoff); return ret; } @@ -2269,7 +2280,8 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { - struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); int ret; ret = seal_check_future_write(info->seals, vma); @@ -2280,7 +2292,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); - vma->vm_ops = &shmem_vm_ops; + /* This is anonymous shared memory if it is unlinked at the time of mmap */ + if (inode->i_nlink) + vma->vm_ops = &shmem_vm_ops; + else + vma->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -2330,7 +2346,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); @@ -2398,7 +2414,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; - struct page *page; int ret; pgoff_t max_off; @@ -2417,53 +2432,70 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!*pagep) { ret = -ENOMEM; - page = shmem_alloc_page(gfp, info, pgoff); - if (!page) + folio = shmem_alloc_folio(gfp, info, pgoff); + if (!folio) goto out_unacct_blocks; if (!zeropage) { /* COPY */ - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - *pagep = page; + *pagep = &folio->page; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } - flush_dcache_page(page); + flush_dcache_folio(folio); } else { /* ZEROPAGE */ - clear_user_highpage(page, dst_addr); + clear_user_highpage(&folio->page, dst_addr); } } else { - page = *pagep; + folio = page_folio(*pagep); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *pagep = NULL; } - VM_BUG_ON(PageLocked(page)); - VM_BUG_ON(PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; - folio = page_folio(page); ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - page, true, wp_copy); + &folio->page, true, wp_copy); if (ret) goto out_delete_from_cache; @@ -2473,13 +2505,13 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - unlock_page(page); + folio_unlock(folio); return 0; out_delete_from_cache: - delete_from_page_cache(page); + filemap_remove_folio(folio); out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; @@ -2498,6 +2530,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ @@ -2509,14 +2542,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); if (ret) return ret; + *pagep = folio_file_page(folio, index); if (PageHWPoison(*pagep)) { - unlock_page(*pagep); - put_page(*pagep); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; return -EIO; } @@ -2575,6 +2609,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) offset = *ppos & ~PAGE_MASK; for (;;) { + struct folio *folio = NULL; struct page *page = NULL; pgoff_t end_index; unsigned long nr, ret; @@ -2589,17 +2624,18 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, SGP_READ); + error = shmem_get_folio(inode, index, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) { - unlock_page(page); + if (folio) { + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); error = -EIO; break; } @@ -2615,14 +2651,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (index == end_index) { nr = i_size & ~PAGE_MASK; if (nr <= offset) { - if (page) - put_page(page); + if (folio) + folio_put(folio); break; } } nr -= offset; - if (page) { + if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -2634,13 +2670,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Mark the page accessed if we read the beginning. */ if (!offset) - mark_page_accessed(page); + folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ ret = copy_page_to_iter(page, offset, nr, to); - put_page(page); + folio_put(folio); } else if (user_backed_iter(to)) { /* @@ -2783,7 +2819,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, info->fallocend = end; for (index = start; index < end; ) { - struct page *page; + struct folio *folio; /* * Good, the fallocate(2) manpage permits EINTR: we may have @@ -2794,10 +2830,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC); + error = shmem_get_folio(inode, index, &folio, + SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; - /* Remove the !PageUptodate pages we added */ + /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, @@ -2806,37 +2843,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } - index++; /* * Here is a more important optimization than it appears: - * a second SGP_FALLOC on the same huge page will clear it, - * making it PageUptodate and un-undoable if we fail later. + * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. */ - if (PageTransCompound(page)) { - index = round_up(index, HPAGE_PMD_NR); - /* Beware 32-bit wraparound */ - if (!index) - index--; - } + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* - * If !PageUptodate, leave it that way so that freeable pages + * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. - * But set_page_dirty so that memory pressure will swap rather - * than free the pages we are allocating (and SGP_CACHE pages + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); cond_resched(); } @@ -2901,6 +2935,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } @@ -2912,7 +2947,7 @@ out_iput: static int shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct file *file, umode_t mode) { struct inode *inode; int error = -ENOSPC; @@ -2927,9 +2962,9 @@ shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, error = simple_acl_create(dir, inode); if (error) goto out_iput; - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); out_iput: iput(inode); return error; @@ -2976,6 +3011,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -2993,6 +3029,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; @@ -3082,6 +3119,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns, old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } @@ -3091,7 +3130,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, int error; int len; struct inode *inode; - struct page *page; + struct folio *folio; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3119,21 +3158,22 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE); + error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) { iput(inode); return error; } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - memcpy(page_address(page), symname, len); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - put_page(page); + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3141,40 +3181,41 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, static void shmem_put_link(void *arg) { - mark_page_accessed(arg); - put_page(arg); + folio_mark_accessed(arg); + folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page = NULL; + struct folio *folio = NULL; int error; + if (!dentry) { - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page) || - !PageUptodate(page)) { - put_page(page); + if (PageHWPoison(folio_page(folio, 0)) || + !folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ); + error = shmem_get_folio(inode, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); - if (!page) + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page)) { - unlock_page(page); - put_page(page); + if (PageHWPoison(folio_page(folio, 0))) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ECHILD); } - unlock_page(page); + folio_unlock(folio); } - set_delayed_call(done, shmem_put_link, page); - return page_address(page); + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR @@ -3204,6 +3245,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); return 0; } @@ -3244,7 +3286,7 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - simple_xattr_list_add(&info->xattrs, new_xattr); + simple_xattr_add(&info->xattrs, new_xattr); } return 0; @@ -3267,9 +3309,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + int err; name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3732,7 +3780,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; #else sb->s_flags |= SB_NOUSER; #endif @@ -3876,6 +3924,7 @@ EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, + .open = generic_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, @@ -3961,6 +4010,15 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; +static const struct vm_operations_struct shmem_anon_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; @@ -4136,6 +4194,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops +#define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 @@ -4241,7 +4300,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + vma->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -4266,18 +4325,20 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; + struct folio *folio; struct page *page; int error; BUG_ON(!shmem_mapping(mapping)); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) return ERR_PTR(error); - unlock_page(page); + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); return ERR_PTR(-EIO); } diff --git a/mm/shuffle.c b/mm/shuffle.c index c13c33b247e8..fb1393b8b3a9 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -12,23 +12,22 @@ DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); static bool shuffle_param; -static int shuffle_show(char *buffer, const struct kernel_param *kp) -{ - return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N'); -} -static __meminit int shuffle_store(const char *val, +static __meminit int shuffle_param_set(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); - - if (rc < 0) - return rc; - if (shuffle_param) + if (param_set_bool(val, kp)) + return -EINVAL; + if (*(bool *)kp->arg) static_branch_enable(&page_alloc_shuffle_key); return 0; } -module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +static const struct kernel_param_ops shuffle_param_ops = { + .set = shuffle_param_set, + .get = param_get_bool, +}; +module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400); /* * For two pages to be swapped in the shuffle, they must be free (on a diff --git a/mm/slab.c b/mm/slab.c index 10e96137b44f..7a269db050ee 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; - spin_lock_init(&parent->list_lock); + raw_spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; } @@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, slab_node = slab_nid(slab); n = get_node(cachep, slab_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, slab_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } @@ -684,7 +684,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); /* * Stuff objects into the remote nodes shared array first. * That way we could avoid the overhead of putting the objects @@ -695,7 +695,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); } } @@ -768,9 +768,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, slabs_destroy(cachep, &list); } else { n = get_node(cachep, slab_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, slab_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } return 1; @@ -811,10 +811,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) */ n = get_node(cachep, node); if (n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); return 0; } @@ -893,7 +893,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, goto fail; n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); if (n->shared && force_change) { free_block(cachep, n->shared->entry, n->shared->avail, node, &list); @@ -911,7 +911,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, new_alien = NULL; } - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); /* @@ -950,7 +950,7 @@ static void cpuup_canceled(long cpu) if (!n) continue; - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; @@ -961,7 +961,7 @@ static void cpuup_canceled(long cpu) nc->avail = 0; if (!cpumask_empty(mask)) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto free_slab; } @@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu) alien = n->alien; n->alien = NULL; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); kfree(shared); if (alien) { @@ -1159,7 +1159,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * /* * Do not assume that spinlocks can be initialized via memcpy: */ - spin_lock_init(&ptr->list_lock); + raw_spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->node[nodeid] = ptr; @@ -1330,11 +1330,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long total_slabs, free_slabs, free_objs; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); total_slabs = n->total_slabs; free_slabs = n->free_slabs; free_objs = n->free_objects; - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", node, total_slabs - free_slabs, total_slabs, @@ -1370,6 +1370,8 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, account_slab(slab, cachep->gfporder, cachep, flags); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab); @@ -1387,9 +1389,11 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) BUG_ON(!folio_test_slab(folio)); __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); page_mapcount_reset(folio_page(folio, 0)); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ + smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; @@ -1619,7 +1623,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) * although actual page can be freed in rcu context */ if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->freelist_cache, freelist); + kfree(freelist); } /* @@ -1671,21 +1675,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (flags & CFLGS_OFF_SLAB) { struct kmem_cache *freelist_cache; size_t freelist_size; + size_t freelist_cache_size; freelist_size = num * sizeof(freelist_idx_t); - freelist_cache = kmalloc_slab(freelist_size, 0u); - if (!freelist_cache) - continue; - - /* - * Needed to avoid possible looping condition - * in cache_grow_begin() - */ - if (OFF_SLAB(freelist_cache)) - continue; + if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { + freelist_cache_size = PAGE_SIZE << get_order(freelist_size); + } else { + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + freelist_cache_size = freelist_cache->size; + + /* + * Needed to avoid possible looping condition + * in cache_grow_begin() + */ + if (OFF_SLAB(freelist_cache)) + continue; + } /* check if off slab has enough benefit */ - if (freelist_cache->size > cachep->size / 2) + if (freelist_cache_size > cachep->size / 2) continue; } @@ -2061,11 +2071,6 @@ done: cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); #endif - if (OFF_SLAB(cachep)) { - cachep->freelist_cache = - kmalloc_slab(cachep->freelist_size, 0u); - } - err = setup_cpu_cache(cachep, gfp); if (err) { __kmem_cache_release(cachep); @@ -2095,7 +2100,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); + assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2103,7 +2108,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, node)->list_lock); + assert_raw_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2143,9 +2148,9 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail = 0; slabs_destroy(cachep, &list); } @@ -2163,9 +2168,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_alien_cache(cachep, n->alien); for_each_kmem_cache_node(cachep, node, n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, n->shared, node, true, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -2187,10 +2192,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); p = n->slabs_free.prev; if (p == &n->slabs_free) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto out; } @@ -2203,7 +2208,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. */ n->free_objects -= cache->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slab_destroy(cache, slab); nr_freed++; } @@ -2292,7 +2297,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, freelist = NULL; else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - freelist = kmem_cache_alloc_node(cachep->freelist_cache, + freelist = kmalloc_node(cachep->freelist_size, local_flags, nodeid); } else { /* We will use last bytes at the slab for freelist */ @@ -2380,7 +2385,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, unsigned int rand; /* Use best entropy available to define a random shift */ - rand = get_random_int(); + rand = get_random_u32(); /* Use a random state if the pre-computed list is not available */ if (!cachep->random_seq) { @@ -2628,7 +2633,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) INIT_LIST_HEAD(&slab->slab_list); n = get_node(cachep, slab_nid(slab)); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); n->total_slabs++; if (!slab->active) { list_add_tail(&slab->slab_list, &n->slabs_free); @@ -2638,7 +2643,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) STATS_INC_GROWN(cachep); n->free_objects += cachep->num - slab->active; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); } @@ -2804,7 +2809,7 @@ static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct slab *slab; - assert_spin_locked(&n->list_lock); + assert_raw_spin_locked(&n->list_lock); slab = list_first_entry_or_null(&n->slabs_partial, struct slab, slab_list); if (!slab) { @@ -2831,10 +2836,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, if (!gfp_pfmemalloc_allowed(flags)) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); slab = get_first_slab(n, true); if (!slab) { - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return NULL; } @@ -2843,7 +2848,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, fixup_slab_list(cachep, n, slab, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; @@ -2902,7 +2907,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) if (!n->free_objects && (!shared || !shared->avail)) goto direct_grow; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ @@ -2926,7 +2931,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) must_grow: n->free_objects -= ac->avail; alloc_done: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); direct_grow: @@ -3146,7 +3151,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, BUG_ON(!n); check_irq_off(); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); slab = get_first_slab(n, false); if (!slab) goto must_grow; @@ -3164,12 +3169,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, fixup_slab_list(cachep, n, slab, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; must_grow: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); if (slab) { /* This slab isn't counted yet so don't update free_objects */ @@ -3181,84 +3186,46 @@ must_grow: } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, - unsigned long caller) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - unsigned long save_flags; - void *ptr; + void *objp = NULL; int slab_node = numa_mem_id(); - struct obj_cgroup *objcg = NULL; - bool init = false; - - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - ptr = kfence_alloc(cachep, orig_size, flags); - if (unlikely(ptr)) - goto out_hooks; - local_irq_save(save_flags); - - if (nodeid == NUMA_NO_NODE) - nodeid = slab_node; - - if (unlikely(!get_node(cachep, nodeid))) { - /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); - goto out; - } - - if (nodeid == slab_node) { + if (nodeid == NUMA_NO_NODE) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cachep, flags); + if (objp) + goto out; + } /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); - if (ptr) - goto out; - } - /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); -out: - local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - init = slab_want_init_on_alloc(flags, cachep); - -out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); - return ptr; -} - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) -{ - void *objp; - - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cache, flags); - if (objp) - goto out; + objp = ____cache_alloc(cachep, flags); + nodeid = slab_node; + } else if (nodeid == slab_node) { + objp = ____cache_alloc(cachep, flags); + } else if (!get_node(cachep, nodeid)) { + /* Node not bootstrapped yet */ + objp = fallback_alloc(cachep, flags); + goto out; } - objp = ____cache_alloc(cache, flags); /* * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_mem_id()); - + objp = ____cache_alloc_node(cachep, flags, nodeid); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) { return ____cache_alloc(cachep, flags); } @@ -3266,8 +3233,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) +slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3284,17 +3251,26 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, goto out; local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); + objp = __do_cache_alloc(cachep, flags, nodeid); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); init = slab_want_init_on_alloc(flags, cachep); out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init, + cachep->object_size); return objp; } +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) +{ + return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, + caller); +} + /* * Caller needs to acquire correct kmem_cache_node's list_lock * @list: List of detached free slabs should be freed by caller @@ -3354,7 +3330,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) check_irq_off(); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; @@ -3383,7 +3359,7 @@ free_done: STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); slabs_destroy(cachep, &list); @@ -3470,22 +3446,11 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); return ret; } -/** - * kmem_cache_alloc - Allocate an object - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * - * Allocate an object from this cache. The flags are only relevant - * if the cache has no available objects. - * - * Return: pointer to the new object or %NULL in case of error - */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { return __kmem_cache_alloc_lru(cachep, NULL, flags); @@ -3521,7 +3486,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: + __do_cache_alloc(s, flags, NUMA_NO_NODE); if (unlikely(!objp)) goto error; @@ -3536,35 +3502,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * Done outside of the IRQ disabled section. */ slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); + slab_want_init_on_alloc(flags, s), s->object_size); /* FIXME: Trace call missing. Christoph would like a bulk variant */ return size; error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -#ifdef CONFIG_TRACING -void * -kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) -{ - void *ret; - - ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, cachep, - size, cachep->size, flags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif - -#ifdef CONFIG_NUMA /** * kmem_cache_alloc_node - Allocate an object on the specified node * @cachep: The cache to allocate from. @@ -3580,65 +3529,21 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, - flags, nodeid); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid, - size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, + unsigned long caller) { - void *ret; - - ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc_node(_RET_IP_, ret, cachep, - size, cachep->size, - flags, nodeid); - return ret; + return slab_alloc_node(cachep, NULL, flags, nodeid, + orig_size, caller); } -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif - -static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - ret = kasan_kmalloc(cachep, ret, size, flags); - - return ret; -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) @@ -3662,45 +3567,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) } #endif -/** - * __do_kmalloc - allocate memory - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). - * @caller: function caller for debug tracking of the caller - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - unsigned long caller) +static __always_inline +void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = slab_alloc(cachep, NULL, flags, size, caller); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(caller, ret, cachep, - size, cachep->size, flags); - - return ret; -} + unsigned long flags; -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc(size, flags, _RET_IP_); + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, caller); + local_irq_restore(flags); } -EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) +void __kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - return __do_kmalloc(size, flags, caller); + __do_kmem_cache_free(cachep, objp, caller); } -EXPORT_SYMBOL(__kmalloc_track_caller); /** * kmem_cache_free - Deallocate an object @@ -3712,34 +3597,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller); */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { - unsigned long flags; cachep = cache_from_obj(cachep, objp); if (!cachep) return; - trace_kmem_cache_free(_RET_IP_, objp, cachep->name); - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + trace_kmem_cache_free(_RET_IP_, objp, cachep); + __do_kmem_cache_free(cachep, objp, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { - struct kmem_cache *s; - size_t i; local_irq_disable(); - for (i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { void *objp = p[i]; + struct kmem_cache *s; - if (!orig_s) /* called via kfree_bulk */ - s = virt_to_cache(objp); - else + if (!orig_s) { + struct folio *folio = virt_to_folio(objp); + + /* called via kfree_bulk */ + if (!folio_test_slab(folio)) { + local_irq_enable(); + free_large_kmalloc(folio, objp); + local_irq_disable(); + continue; + } + s = folio_slab(folio)->slab_cache; + } else { s = cache_from_obj(orig_s, objp); + } + if (!s) continue; @@ -3755,39 +3644,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/** - * kfree - free previously allocated memory - * @objp: pointer returned by kmalloc. - * - * If @objp is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. - */ -void kfree(const void *objp) -{ - struct kmem_cache *c; - unsigned long flags; - - trace_kfree(_RET_IP_, objp); - - if (unlikely(ZERO_OR_NULL_PTR(objp))) - return; - local_irq_save(flags); - kfree_debugcheck(objp); - c = virt_to_cache(objp); - if (!c) { - local_irq_restore(flags); - return; - } - debug_check_no_locks_freed(objp, c->object_size); - - debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); -} -EXPORT_SYMBOL(kfree); - /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ @@ -3860,9 +3716,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, node = cpu_to_mem(cpu); n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } free_percpu(prev); @@ -3954,9 +3810,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, return; } - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, ac, node, false, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -4040,7 +3896,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); total_slabs += n->total_slabs; free_slabs += n->free_slabs; @@ -4049,7 +3905,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) if (n->shared) shared_avail += n->shared->avail; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); } num_objs = total_slabs * cachep->num; active_slabs = total_slabs - free_slabs; @@ -4190,28 +4046,3 @@ void __check_heap_object(const void *ptr, unsigned long n, usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ - -/** - * __ksize -- Uninstrumented ksize. - * @objp: pointer to the object - * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. - * - * Return: size of the actual memory used by @objp in bytes - */ -size_t __ksize(const void *objp) -{ - struct kmem_cache *c; - size_t size; - - BUG_ON(!objp); - if (unlikely(objp == ZERO_SIZE_PTR)) - return 0; - - c = virt_to_cache(objp); - size = c ? c->object_size : 0; - - return size; -} -EXPORT_SYMBOL(__ksize); diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ec..7cc432969945 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -11,37 +11,43 @@ struct slab { #if defined(CONFIG_SLAB) + struct kmem_cache *slab_cache; union { - struct list_head slab_list; + struct { + struct list_head slab_list; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + }; struct rcu_head rcu_head; }; - struct kmem_cache *slab_cache; - void *freelist; /* array of free object indexes */ - void *s_mem; /* first object */ unsigned int active; #elif defined(CONFIG_SLUB) - union { - struct list_head slab_list; - struct rcu_head rcu_head; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif - }; struct kmem_cache *slab_cache; - /* Double-word boundary */ - void *freelist; /* first free object */ union { - unsigned long counters; struct { - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; + union { + struct list_head slab_list; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; }; + struct rcu_head rcu_head; }; unsigned int __unused; @@ -66,9 +72,10 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); -SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ #ifndef CONFIG_SLOB -SLAB_MATCH(rcu_head, rcu_head); +SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +#else +SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ #endif SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG @@ -76,6 +83,9 @@ SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB) +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *))); +#endif /** * folio_slab - Converts from folio to slab. @@ -207,8 +217,6 @@ struct kmem_cache { unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ slab_flags_t flags; /* Active flags on the slab */ - unsigned int useroffset;/* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -273,6 +281,11 @@ void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); #endif gfp_t kmalloc_fix_flags(gfp_t flags); @@ -331,7 +344,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS) + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC) #else #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif @@ -351,6 +365,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ SLAB_ACCOUNT | \ + SLAB_KMALLOC | \ SLAB_NO_USER_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); @@ -658,8 +673,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) print_tracking(cachep, x); return cachep; } + +void free_large_kmalloc(struct folio *folio, void *object); + #endif /* CONFIG_SLOB */ +size_t __ksize(const void *objp); + static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB @@ -710,13 +730,27 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, static inline void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init) + size_t size, void **p, bool init, + unsigned int orig_size) { + unsigned int zero_size = s->object_size; size_t i; flags &= gfp_allowed_mask; /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* * As memory initialization might be integrated into KASAN, * kasan_slab_alloc and initialization memset must be * kept together to avoid discrepancies in behavior. @@ -726,9 +760,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, for (i = 0; i < size; i++) { p[i] = kasan_slab_alloc(s, p[i], flags, init); if (p[i] && init && !kasan_has_integrated_init()) - memset(p[i], 0, s->object_size); + memset(p[i], 0, zero_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); @@ -739,9 +774,8 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. */ struct kmem_cache_node { - spinlock_t list_lock; - #ifdef CONFIG_SLAB + raw_spinlock_t list_lock; struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; @@ -757,6 +791,7 @@ struct kmem_cache_node { #endif #ifdef CONFIG_SLUB + spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG @@ -860,4 +895,8 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif +#ifdef CONFIG_SLUB_DEBUG +void skip_orig_size_check(struct kmem_cache *s, const void *object); +#endif + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 07b948288f84..1cba98acc486 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -143,8 +143,10 @@ int slab_unmergeable(struct kmem_cache *s) if (s->ctor) return 1; +#ifdef CONFIG_HARDENED_USERCOPY if (s->usersize) return 1; +#endif /* * We may have set a slab to be unmergeable during bootstrap. @@ -223,8 +225,10 @@ static struct kmem_cache *create_cache(const char *name, s->size = s->object_size = object_size; s->align = align; s->ctor = ctor; +#ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; +#endif err = __kmem_cache_create(s, flags); if (err) @@ -317,7 +321,8 @@ kmem_cache_create_usercopy(const char *name, flags &= CACHE_CREATE_MASK; /* Fail closed on bad usersize of useroffset values. */ - if (WARN_ON(!usersize && useroffset) || + if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || + WARN_ON(!usersize && useroffset) || WARN_ON(size < usersize || size - usersize < useroffset)) usersize = useroffset = 0; @@ -475,6 +480,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { int refcnt; + bool rcu_set; if (unlikely(!s) || !kasan_check_byte(s)) return; @@ -482,6 +488,8 @@ void kmem_cache_destroy(struct kmem_cache *s) cpus_read_lock(); mutex_lock(&slab_mutex); + rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; + refcnt = --s->refcount; if (refcnt) goto out_unlock; @@ -492,7 +500,7 @@ void kmem_cache_destroy(struct kmem_cache *s) out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); - if (!refcnt && !(s->flags & SLAB_TYPESAFE_BY_RCU)) + if (!refcnt && !rcu_set) kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -508,13 +516,9 @@ EXPORT_SYMBOL(kmem_cache_destroy); */ int kmem_cache_shrink(struct kmem_cache *cachep) { - int ret; - - kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep); - return ret; + return __kmem_cache_shrink(cachep); } EXPORT_SYMBOL(kmem_cache_shrink); @@ -596,8 +600,8 @@ void kmem_dump_obj(void *object) ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; pr_cont(" pointer offset %lu", ptroffset); } - if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) - pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_slab_cache && kp.kp_slab_cache->object_size) + pr_cont(" size %u", kp.kp_slab_cache->object_size); if (kp.kp_ret) pr_cont(" allocated at %pS\n", kp.kp_ret); else @@ -641,8 +645,10 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, align = max(align, size); s->align = calculate_alignment(flags, align, size); +#ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; +#endif err = __kmem_cache_create(s, flags); @@ -662,7 +668,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, if (!s) panic("Out of memory when creating slab %s\n", name); - create_boot_cache(s, name, size, flags, useroffset, usersize); + create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, + usersize); kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; @@ -734,6 +741,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[kmalloc_type(flags)][index]; } +size_t kmalloc_size_roundup(size_t size) +{ + struct kmem_cache *c; + + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + /* Above the smaller buckets, size is a multiple of page size. */ + if (size > KMALLOC_MAX_CACHE_SIZE) + return PAGE_SIZE << get_order(size); + + /* The flags don't matter since size_index is common to all. */ + c = kmalloc_slab(size, GFP_KERNEL); + return c ? c->object_size : 0; +} +EXPORT_SYMBOL(kmalloc_size_roundup); + #ifdef CONFIG_ZONE_DMA #define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, #else @@ -746,10 +773,16 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) #define KMALLOC_CGROUP_NAME(sz) #endif +#ifndef CONFIG_SLUB_TINY +#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz, +#else +#define KMALLOC_RCL_NAME(sz) +#endif + #define INIT_KMALLOC_INFO(__size, __short_size) \ { \ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + KMALLOC_RCL_NAME(__short_size) \ KMALLOC_CGROUP_NAME(__short_size) \ KMALLOC_DMA_NAME(__short_size) \ .size = __size, \ @@ -757,8 +790,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. - * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is - * kmalloc-32M. + * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is + * kmalloc-2M. */ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(0, 0), @@ -782,11 +815,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(262144, 256k), INIT_KMALLOC_INFO(524288, 512k), INIT_KMALLOC_INFO(1048576, 1M), - INIT_KMALLOC_INFO(2097152, 2M), - INIT_KMALLOC_INFO(4194304, 4M), - INIT_KMALLOC_INFO(8388608, 8M), - INIT_KMALLOC_INFO(16777216, 16M), - INIT_KMALLOC_INFO(33554432, 32M) + INIT_KMALLOC_INFO(2097152, 2M) }; /* @@ -839,7 +868,7 @@ void __init setup_kmalloc_cache_index_table(void) static void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { - if (type == KMALLOC_RECLAIM) { + if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { if (mem_cgroup_kmem_disabled()) { @@ -899,6 +928,158 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; } + +void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + kmsan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc(caller, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = __kmem_cache_alloc_node(s, flags, node, size, caller); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(caller, ret, size, s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc. + * + * If @object is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + __kmem_cache_free(s, (void *)object, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/** + * __ksize -- Report full size of underlying allocation + * @object: pointer to the object + * + * This should only be used internally to query the true size of allocations. + * It is not meant to be a way to discover the usable size of an allocation + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, + * and/or FORTIFY_SOURCE. + * + * Return: size of the actual memory used by @object in bytes + */ +size_t __ksize(const void *object) +{ + struct folio *folio; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + folio = virt_to_folio(object); + + if (unlikely(!folio_test_slab(folio))) { + if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) + return 0; + if (WARN_ON(object != folio_address(folio))) + return 0; + return folio_size(folio); + } + +#ifdef CONFIG_SLUB_DEBUG + skip_orig_size_check(folio_slab(folio)->slab_cache, object); +#endif + + return slab_ksize(folio_slab(folio)->slab_cache); +} + +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, + size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -918,37 +1099,51 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = NULL; struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - page = alloc_pages(flags, order); - if (likely(page)) { - ret = page_address(page); + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); } - ret = kasan_kmalloc_large(ret, size, flags); - /* As ret might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ret, size, 1, flags); + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); + + return ptr; +} + +void *kmalloc_large(size_t size, gfp_t flags) +{ + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_order); +EXPORT_SYMBOL(kmalloc_large); -#ifdef CONFIG_TRACING -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); + void *ret = __kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_order_trace); -#endif +EXPORT_SYMBOL(kmalloc_large_node); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ @@ -1147,17 +1342,17 @@ module_init(slab_proc_init); #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks; - /* Don't use instrumented ksize to allow precise KASAN poisoning. */ + /* Check for double-free before calling ksize. */ if (likely(!ZERO_OR_NULL_PTR(p))) { if (!kasan_check_byte(p)) return NULL; - ks = kfence_ksize(p) ?: __ksize(p); + ks = ksize(p); } else ks = 0; @@ -1225,35 +1420,21 @@ void kfree_sensitive(const void *p) void *mem = (void *)p; ks = ksize(mem); - if (ks) + if (ks) { + kasan_unpoison_range(mem, ks); memzero_explicit(mem, ks); + } kfree(mem); } EXPORT_SYMBOL(kfree_sensitive); -/** - * ksize - get the actual amount of memory allocated for a given object - * @objp: Pointer to the object - * - * kmalloc may internally round up allocations and return more memory - * than requested. ksize() can be used to determine the actual amount of - * memory allocated. The caller may use this additional memory, even though - * a smaller amount of memory was initially specified with the kmalloc call. - * The caller must guarantee that objp points to a valid object previously - * allocated with either kmalloc() or kmem_cache_alloc(). The object - * must not be freed during the duration of the call. - * - * Return: size of the actual memory used by @objp in bytes - */ size_t ksize(const void *objp) { - size_t size; - /* - * We need to first check that the pointer to the object is valid, and - * only then unpoison the memory. The report printed from ksize() is - * more useful, then when it's printed later when the behaviour could - * be undefined due to a potential use-after-free or double-free. + * We need to first check that the pointer to the object is valid. + * The KASAN report printed from ksize() is more useful, then when + * it's printed later when the behaviour could be undefined due to + * a potential use-after-free or double-free. * * We use kasan_check_byte(), which is supported for the hardware * tag-based KASAN mode, unlike kasan_check_read/write(). @@ -1267,21 +1448,13 @@ size_t ksize(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; - size = kfence_ksize(objp) ?: __ksize(objp); - /* - * We assume that ksize callers could use whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_range(objp, size); - return size; + return kfence_ksize(objp) ?: __ksize(objp); } EXPORT_SYMBOL(ksize); /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 2bd4f476c340..fe567fcfa3a3 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc_node(caller, ret, NULL, - size, size + minalign, gfp, node); + trace_kmalloc(caller, ret, size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); @@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) -{ - return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { return __do_kmalloc_node(size, gfp, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif void kfree(const void *block) { @@ -574,6 +564,20 @@ void kfree(const void *block) } EXPORT_SYMBOL(kfree); +size_t kmalloc_size_roundup(size_t size) +{ + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + + return ALIGN(size, ARCH_KMALLOC_MINALIGN); +} + +EXPORT_SYMBOL(kmalloc_size_roundup); + /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t __ksize(const void *block) { @@ -594,7 +598,6 @@ size_t __ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { @@ -602,6 +605,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } + + /* Actual size allocated */ + c->size = SLOB_UNITS(c->size) * SLOB_UNIT; c->flags = flags; return 0; } @@ -616,14 +622,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } if (b && c->ctor) { @@ -647,7 +649,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_ return slob_alloc_node(cachep, flags, NUMA_NO_NODE); } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_NUMA + void *__kmalloc_node(size_t size, gfp_t gfp, int node) { return __do_kmalloc_node(size, gfp, node, _RET_IP_); @@ -659,7 +661,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) return slob_alloc_node(cachep, gfp, node); } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif static void __kmem_cache_free(void *b, int size) { @@ -680,7 +681,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - trace_kmem_cache_free(_RET_IP_, b, c->name); + trace_kmem_cache_free(_RET_IP_, b, c); if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); diff --git a/mm/slub.c b/mm/slub.c index 862dbd9af4f5..13459c69095a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,6 +22,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> +#include <linux/kmsan.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> @@ -38,6 +39,7 @@ #include <linux/memcontrol.h> #include <linux/random.h> #include <kunit/test.h> +#include <kunit/test-bug.h> #include <linux/sort.h> #include <linux/debugfs.h> @@ -50,7 +52,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -64,8 +66,9 @@ * The slab_lock is a wrapper around the page lock, thus it is a bit * spinlock. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * * A. slab->freelist -> List of free objects in a slab * B. slab->inuse -> Number of objects in use * C. slab->objects -> Number of objects in slab @@ -94,15 +97,20 @@ * allocating a long series of objects that fill up slabs does not require * the list lock. * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * * cpu_slab->lock local lock * * This locks protect slowpath manipulation of all kmem_cache_cpu fields * except the stat counters. This is a percpu structure manipulated only by * the local cpu, so the lock protects against being preempted or interrupted * by an irq. Fast path operations rely on lockless operations instead. - * On PREEMPT_RT, the local lock does not actually disable irqs (and thus - * prevent the lockless operations), so fastpath operations also need to take - * the lock and are no longer lockless. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. * * lockless fastpaths * @@ -163,8 +171,9 @@ * function call even on !PREEMPT_RT, use inline preempt_disable() there. */ #ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) #else #define slub_get_cpu_ptr(var) \ ({ \ @@ -176,6 +185,13 @@ do { \ (void)(var); \ migrate_enable(); \ } while (0) +#define USE_LOCKLESS_FAST_PATH() (false) +#endif + +#ifndef CONFIG_SLUB_TINY +#define __fastpath_inline __always_inline +#else +#define __fastpath_inline #endif #ifdef CONFIG_SLUB_DEBUG @@ -186,11 +202,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +/* Structure holding parameters for get_partial() call chain */ +struct partial_context { + struct slab **slab; + gfp_t flags; + unsigned int orig_size; +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } +static inline bool slub_debug_orig_size(struct kmem_cache *s) +{ + return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && + (s->flags & SLAB_KMALLOC)); +} + void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -219,6 +248,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG +#ifndef CONFIG_SLUB_TINY /* * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -231,6 +261,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 +#else +#define MIN_PARTIAL 0 +#define MAX_PARTIAL 0 +#endif #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@ -276,7 +310,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); #else @@ -310,6 +344,13 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) */ static nodemask_t slab_nodes; +#ifndef CONFIG_SLUB_TINY +/* + * Workqueue used for flush_cpu_slab(). + */ +static struct workqueue_struct *flushwq; +#endif + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -354,11 +395,24 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_dereference(s, object + s->offset); } +#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } +#endif +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. + */ +__no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { unsigned long freepointer_addr; @@ -442,7 +496,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) /* * Per slab locking using the pagelock */ -static __always_inline void __slab_lock(struct slab *slab) +static __always_inline void slab_lock(struct slab *slab) { struct page *page = slab_page(slab); @@ -450,7 +504,7 @@ static __always_inline void __slab_lock(struct slab *slab) bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void __slab_unlock(struct slab *slab) +static __always_inline void slab_unlock(struct slab *slab) { struct page *page = slab_page(slab); @@ -458,31 +512,19 @@ static __always_inline void __slab_unlock(struct slab *slab) __bit_spin_unlock(PG_locked, &page->flags); } -static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) -{ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(*flags); - __slab_lock(slab); -} - -static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) -{ - __slab_unlock(slab); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(*flags); -} - /* * Interrupts must be disabled (for the fallback code to work right), typically - * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different - * so we disable interrupts as part of slab_[un]lock(). + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -494,18 +536,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab } else #endif { - /* init to 0 to prevent spurious warnings */ - unsigned long flags = 0; - - slab_lock(slab, &flags); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - slab_unlock(slab, &flags); + slab_unlock(slab); return true; } - slab_unlock(slab, &flags); + slab_unlock(slab); } cpu_relax(); @@ -536,16 +575,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, unsigned long flags; local_irq_save(flags); - __slab_lock(slab); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); return true; } - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); } @@ -561,7 +600,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_RAW_SPINLOCK(object_map_lock); +static DEFINE_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct slab *slab) @@ -580,7 +619,7 @@ static bool slab_add_kunit_errors(void) { struct kunit_resource *resource; - if (likely(!current->kunit_test)) + if (!kunit_get_current_test()) return false; resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); @@ -595,30 +634,6 @@ static bool slab_add_kunit_errors(void) static inline bool slab_add_kunit_errors(void) { return false; } #endif -/* - * Determine a map of objects in use in a slab. - * - * Node listlock must be held to guarantee that the slab does - * not vanish from under us. - */ -static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) - __acquires(&object_map_lock) -{ - VM_BUG_ON(!irqs_disabled()); - - raw_spin_lock(&object_map_lock); - - __fill_map(object_map, s, slab); - - return object_map; -} - -static void put_map(unsigned long *map) __releases(&object_map_lock) -{ - VM_BUG_ON(map != object_map); - raw_spin_unlock(&object_map_lock); -} - static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) @@ -816,6 +831,55 @@ static void print_slab_info(const struct slab *slab) folio_flags(folio, 0)); } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + +#ifdef CONFIG_KASAN_GENERIC + /* + * KASAN could save its free meta data in object's data area at + * offset 0, if the size is larger than 'orig_size', it will + * overlap the data redzone in [orig_size+1, object_size], and + * the check should be skipped. + */ + if (kasan_metadata_size(s, true) > orig_size) + orig_size = s->object_size; +#endif + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + +void skip_orig_size_check(struct kmem_cache *s, const void *object) +{ + set_orig_size(s, (void *)object, s->object_size); +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) { struct va_format vaf; @@ -875,7 +939,10 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - off += kasan_metadata_size(s); + if (slub_debug_orig_size(s)) + off += sizeof(unsigned int); + + off += kasan_metadata_size(s, false); if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ @@ -931,17 +998,28 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = kasan_reset_tag(object); + unsigned int poison_size = s->object_size; - if (s->flags & SLAB_RED_ZONE) + if (s->flags & SLAB_RED_ZONE) { memset(p - s->red_left_pad, val, s->red_left_pad); + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + /* + * Redzone the extra allocated space by kmalloc than + * requested, and the poison size will be limited to + * the original request size accordingly. + */ + poison_size = get_orig_size(s, object); + } + } + if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->object_size - 1); - p[s->object_size - 1] = POISON_END; + memset(p, POISON_FREE, poison_size - 1); + p[poison_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->object_size, val, s->inuse - s->object_size); + memset(p + poison_size, val, s->inuse - poison_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -1008,7 +1086,8 @@ skip_bug_print: * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at minimum + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -1026,11 +1105,15 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ - if (s->flags & SLAB_STORE_USER) + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); - off += kasan_metadata_size(s); + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + + off += kasan_metadata_size(s, false); if (size_from_object(s) == off) return 1; @@ -1080,6 +1163,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, { u8 *p = object; u8 *endobject = object + s->object_size; + unsigned int orig_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", @@ -1089,6 +1173,17 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + orig_size = get_orig_size(s, object); + + if (s->object_size > orig_size && + !check_bytes_and_report(s, slab, object, + "kmalloc Redzone", p + orig_size, + val, s->object_size - orig_size)) { + return 0; + } + } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, slab, p, "Alignment padding", @@ -1323,21 +1418,19 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, return 1; } -static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) +static noinline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); - return 1; + return true; bad: if (folio_test_slab(slab_folio(slab))) { @@ -1350,7 +1443,7 @@ bad: slab->inuse = slab->objects; slab->freelist = NULL; } - return 0; + return false; } static inline int free_consistency_checks(struct kmem_cache *s, @@ -1385,63 +1478,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track_update(s, object, TRACK_FREE, addr, handle); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* * Parse a block of slub_debug options. Blocks are delimited by ';' * @@ -1660,17 +1696,19 @@ static inline void setup_object_debug(struct kmem_cache *s, void *object) {} static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} -static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } +static inline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return true; } -static inline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) { return 0; } +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { return true; } static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline depot_stack_handle_t set_track_prepare(void) { return 0; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1693,35 +1731,24 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } +#endif #endif /* CONFIG_SLUB_DEBUG */ /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - return ptr; -} - -static __always_inline void kfree_hook(void *x) -{ - kmemleak_free(x); - kasan_kfree_large(x); -} - static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); debug_check_no_locks_freed(x, s->object_size); @@ -1830,6 +1857,8 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, slab = folio_slab(folio); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); if (page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab); @@ -1911,7 +1940,7 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) return false; freelist_count = oo_objects(s->oo); - pos = get_random_int() % freelist_count; + pos = get_random_u32_below(freelist_count); page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); @@ -1976,11 +2005,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) - goto out; + return NULL; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; account_slab(slab, oo_order(oo), s, flags); @@ -2007,15 +2038,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - slab->inuse = slab->objects; - slab->frozen = 1; - -out: - if (!slab) - return NULL; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - return slab; } @@ -2036,17 +2058,11 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) int order = folio_order(folio); int pages = 1 << order; - if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { - void *p; - - slab_pad_check(s, slab); - for_each_object(p, s, slab_address(slab), slab->objects) - check_object(s, slab, p, SLUB_RED_INACTIVE); - } - __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ + smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab(slab, order, s); @@ -2062,9 +2078,17 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct slab *slab) { - if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); - } else + else __free_slab(s, slab); } @@ -2103,6 +2127,75 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. + */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, int orig_size) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + remove_partial(n, slab); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, + struct slab *slab, int orig_size) +{ + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object, orig_size)) + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + */ + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + +/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. * @@ -2159,7 +2252,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct slab **ret_slab, gfp_t gfpflags) + struct partial_context *pc) { struct slab *slab, *slab2; void *object = NULL; @@ -2179,15 +2272,23 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(slab, gfpflags)) + if (!pfmemalloc_match(slab, pc->flags)) continue; + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab, + pc->orig_size); + if (object) + break; + continue; + } + t = acquire_slab(s, n, slab, object == NULL); if (!t) break; if (!object) { - *ret_slab = slab; + *pc->slab = slab; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { @@ -2211,14 +2312,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct slab **ret_slab) +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(pc->flags); void *object; unsigned int cpuset_mems_cookie; @@ -2246,15 +2346,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed(zone, flags) && + if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, ret_slab, flags); + object = get_partial_node(s, n, pc); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2275,8 +2375,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct slab **ret_slab) +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) { void *object; int searchnode = node; @@ -2284,13 +2383,15 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags); + object = get_partial_node(s, get_node(s, searchnode), pc); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, ret_slab); + return get_any_partial(s, pc); } +#ifndef CONFIG_SLUB_TINY + #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@ -2304,7 +2405,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, * different cpus. */ #define TID_STEP 1 -#endif +#endif /* CONFIG_PREEMPTION */ static inline unsigned long next_tid(unsigned long tid) { @@ -2373,7 +2474,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; + enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; enum slab_modes mode = M_NONE; @@ -2449,14 +2550,6 @@ redo: * acquire_slab() will see a slab that is frozen */ spin_lock_irqsave(&n->list_lock, flags); - } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { - mode = M_FULL; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); } else { mode = M_FULL_NOLIST; } @@ -2466,7 +2559,7 @@ redo: old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) { - if (mode == M_PARTIAL || mode == M_FULL) + if (mode == M_PARTIAL) spin_unlock_irqrestore(&n->list_lock, flags); goto redo; } @@ -2480,10 +2573,6 @@ redo: stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); - } else if (mode == M_FULL) { - add_full(s, n, slab); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, DEACTIVATE_FULL); } else if (mode == M_FULL_NOLIST) { stat(s, DEACTIVATE_FULL); } @@ -2730,7 +2819,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) INIT_WORK(&sfw->work, flush_cpu_slab); sfw->skip = false; sfw->s = s; - schedule_work_on(cpu, &sfw->work); + queue_work_on(cpu, flushwq, &sfw->work); } for_each_online_cpu(cpu) { @@ -2765,6 +2854,13 @@ static int slub_cpu_dead(unsigned int cpu) return 0; } +#else /* CONFIG_SLUB_TINY */ +static inline void flush_all_cpus_locked(struct kmem_cache *s) { } +static inline void flush_all(struct kmem_cache *s) { } +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline int slub_cpu_dead(unsigned int cpu) { return 0; } +#endif /* CONFIG_SLUB_TINY */ + /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -2788,9 +2884,67 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) +{ + bool checks_ok = false; + void *object = head; + int cnt = 0; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < *bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, *bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > *bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != *bulk_cnt) { + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + *bulk_cnt, cnt); + *bulk_cnt = cnt; + } + +out: + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + return checks_ok; +} #endif /* CONFIG_SLUB_DEBUG */ -#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct slab *)) { @@ -2804,12 +2958,12 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } -#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ +#ifdef CONFIG_SLUB_DEBUG static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { -#ifdef CONFIG_SLUB_DEBUG static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; @@ -2840,8 +2994,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } -#endif } +#else /* CONFIG_SLUB_DEBUG */ +static inline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } +#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { @@ -2851,6 +3008,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } +#ifndef CONFIG_SLUB_TINY /* * Check the slab->freelist and either transfer the freelist to the * per cpu freelist or deactivate the slab. @@ -2905,11 +3063,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *freelist; struct slab *slab; unsigned long flags; + struct partial_context pc; stat(s, ALLOC_SLOWPATH); @@ -3023,7 +3182,10 @@ new_slab: new_objects: - freelist = get_partial(s, gfpflags, node, &slab); + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + freelist = get_partial(s, node, &pc); if (freelist) goto check_new_slab; @@ -3036,36 +3198,53 @@ new_objects: return NULL; } + stat(s, ALLOC_SLAB); + + if (kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab, orig_size); + + if (unlikely(!freelist)) + goto new_objects; + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + /* * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ freelist = slab->freelist; slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; - stat(s, ALLOC_SLAB); + inc_slabs_node(s, slab_nid(slab), slab->objects); check_new_slab: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; - } + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. */ - goto return_single; + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } retry_load_slab: @@ -3089,11 +3268,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; - -return_single: - - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; } /* @@ -3102,7 +3276,7 @@ return_single: * pointer. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *p; @@ -3115,52 +3289,20 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = slub_get_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c); + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif return p; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - -/* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. - * - * Otherwise we can simply pick the next object from the lockless free list. - */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, +static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - void *object; struct kmem_cache_cpu *c; struct slab *slab; unsigned long tid; - struct obj_cgroup *objcg = NULL; - bool init = false; - - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); - if (!s) - return NULL; - - object = kfence_alloc(s, orig_size, gfpflags); - if (unlikely(object)) - goto out; + void *object; redo: /* @@ -3197,16 +3339,10 @@ redo: object = c->freelist; slab = c->slab; - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if a - * slowpath has taken the local_lock_irqsave(), it is not protected - * against a fast path operation in an irq handler. So we need to take - * the slow path which uses local_lock. It is still relatively fast if - * there is a suitable cpu freelist. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || + + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); } else { void *next_object = get_freepointer_safe(s, object); @@ -3236,29 +3372,101 @@ redo: stat(s, ALLOC_FASTPATH); } + return object; +} +#else /* CONFIG_SLUB_TINY */ +static void *__slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + struct partial_context pc; + struct slab *slab; + void *object; + + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + object = get_partial(s, node, &pc); + + if (object) + return object; + + slab = new_slab(s, gfpflags, node); + if (unlikely(!slab)) { + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + + object = alloc_single_from_new_slab(s, slab, orig_size); + + return object; +} +#endif /* CONFIG_SLUB_TINY */ + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + struct obj_cgroup *objcg = NULL; + bool init = false; + + s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + if (!s) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); + /* + * When init equals 'true', like for kzalloc() family, only + * @orig_size bytes might be zeroed instead of s->object_size + */ + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); return object; } -static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, +static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, unsigned long addr, size_t orig_size) { return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); } -static __always_inline +static __fastpath_inline void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } @@ -3276,45 +3484,84 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller) { - void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags); - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; + return slab_alloc_node(s, NULL, gfpflags, node, + caller, orig_size); } -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif -#ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, s, - s->object_size, s->size, gfpflags, node); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) { - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + int cnt = bulk_cnt; + unsigned long flags; + depot_stack_handle_t handle = 0; - trace_kmalloc_node(_RET_IP_, ret, s, - size, s->size, gfpflags, node); + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; + spin_lock_irqsave(&n->list_lock, flags); + + if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } } -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif -#endif /* CONFIG_NUMA */ /* * Slow path handling. This may still be called frequently since objects @@ -3341,9 +3588,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + free_to_partial_list(s, slab, head, tail, cnt, addr); return; + } do { if (unlikely(n)) { @@ -3441,6 +3689,7 @@ slab_empty: discard_slab(s, slab); } +#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@ -3463,6 +3712,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; + void **freelist; redo: /* @@ -3477,9 +3727,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); - if (likely(slab == c->slab)) { -#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); + if (unlikely(slab != c->slab)) { + __slab_free(s, slab, head, tail_obj, cnt, addr); + return; + } + + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3491,16 +3745,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } -#else /* CONFIG_PREEMPT_RT */ - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if - * a slowpath has taken the local_lock_irqsave(), it is not - * protected against a fast path operation in an irq handler. So - * we need to take the local_lock. We shouldn't simply defer to - * __slab_free() as that wouldn't use the cpu freelist at all. - */ - void **freelist; - + } else { + /* Update the free list under the local lock */ local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { @@ -3515,14 +3761,21 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); -#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, slab, head, tail_obj, cnt, addr); + } + stat(s, FREE_FASTPATH); +} +#else /* CONFIG_SLUB_TINY */ +static void do_slab_free(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, + int cnt, unsigned long addr) +{ + void *tail_obj = tail ? : head; + __slab_free(s, slab, head, tail_obj, cnt, addr); } +#endif /* CONFIG_SLUB_TINY */ -static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, +static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { @@ -3542,12 +3795,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) +{ + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); if (!s) return; - trace_kmem_cache_free(_RET_IP_, x, s->name); + trace_kmem_cache_free(_RET_IP_, x, s); slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -3560,19 +3818,6 @@ struct detached_freelist { struct kmem_cache *s; }; -static inline void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kfree_hook(object); - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same @@ -3663,18 +3908,13 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +#ifndef CONFIG_SLUB_TINY +static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) { struct kmem_cache_cpu *c; int i; - struct obj_cgroup *objcg = NULL; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); - if (unlikely(!s)) - return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@ -3709,7 +3949,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); + _RET_IP_, c, s->object_size); if (unlikely(!p[i])) goto error; @@ -3728,19 +3968,72 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_unlock_irq(&s->cpu_slab->lock); slub_put_cpu_ptr(s->cpu_slab); - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled fastpath loop. - */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); return i; + error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); + kmem_cache_free_bulk(s, i, p); + return 0; + +} +#else /* CONFIG_SLUB_TINY */ +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) +{ + int i; + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); + + if (unlikely(object)) { + p[i] = object; + continue; + } + + p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, + _RET_IP_, s->object_size); + if (unlikely(!p[i])) + goto error; + + maybe_wipe_obj_freeptr(s, p[i]); + } + + return i; + +error: + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; } +#endif /* CONFIG_SLUB_TINY */ + +/* Note that interrupts must be enabled when calling this function. */ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + int i; + struct obj_cgroup *objcg = NULL; + + if (!size) + return 0; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + if (unlikely(!s)) + return 0; + + i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); + + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + if (i != 0) + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size); + return i; +} EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3764,7 +4057,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * take the list_lock. */ static unsigned int slub_min_order; -static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_max_order = + IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; static unsigned int slub_min_objects; /* @@ -3895,10 +4189,12 @@ init_kmem_cache_node(struct kmem_cache_node *n) #endif } +#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * + sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -3914,6 +4210,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } +#else +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +{ + return 1; +} +#endif /* CONFIG_SLUB_TINY */ static struct kmem_cache *kmem_cache_node; @@ -3936,6 +4238,7 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); + inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); @@ -3950,7 +4253,6 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - slab->frozen = 0; kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); @@ -3976,7 +4278,9 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); +#ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); +#endif free_kmem_cache_nodes(s); } @@ -4083,7 +4387,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + if (slub_debug_orig_size(s) || + (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || s->ctor) { /* @@ -4112,12 +4417,17 @@ static int calculate_sizes(struct kmem_cache *s) } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) { /* * Need to store information about allocs and frees after * the object. */ size += 2 * sizeof(struct track); + + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } #endif kasan_cache_create(s, &size, &s->flags); @@ -4237,23 +4547,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); - unsigned long flags; - unsigned long *map; void *p; slab_err(s, slab, text, s->name); - slab_lock(slab, &flags); - map = get_map(s, slab); + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + for_each_object(p, s, addr, slab->objects) { - if (!test_bit(__obj_to_index(s, addr, p), map)) { + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - put_map(map); - slab_unlock(slab, &flags); + spin_unlock(&object_map_lock); #endif } @@ -4404,78 +4712,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -void *__kmalloc(size_t size, gfp_t flags) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, flags, _RET_IP_, size); - - trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc); - -#ifdef CONFIG_NUMA -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - return kmalloc_large_node_hook(ptr, size, flags); -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); - - trace_kmalloc_node(_RET_IP_, ret, NULL, - size, PAGE_SIZE << get_order(size), - flags, node); - - return ret; - } - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -4526,43 +4762,6 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif /* CONFIG_HARDENED_USERCOPY */ -size_t __ksize(const void *object) -{ - struct folio *folio; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(object); - - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - return slab_ksize(folio_slab(folio)->slab_cache); -} -EXPORT_SYMBOL(__ksize); - -void kfree(const void *x) -{ - struct folio *folio; - struct slab *slab; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return; - - folio = virt_to_folio(x); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - return; - } - slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - #define SHRINK_PROMOTE_MAX 32 /* @@ -4611,6 +4810,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } @@ -4626,7 +4826,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) - discard_slab(s, slab); + free_slab(s, slab); if (slabs_node(s, node)) ret = 1; @@ -4757,11 +4957,6 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -static struct notifier_block slab_memory_callback_nb = { - .notifier_call = slab_memory_callback, - .priority = SLAB_CALLBACK_PRI, -}; - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -4827,7 +5022,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); - register_hotmemory_notifier(&slab_memory_callback_nb); + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; @@ -4858,6 +5053,10 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { +#ifndef CONFIG_SLUB_TINY + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + WARN_ON(!flushwq); +#endif } struct kmem_cache * @@ -4908,61 +5107,7 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, gfpflags, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, s, size, s->size, gfpflags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); - - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - gfpflags, node); - - return ret; - } - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif - -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct slab *slab) { return slab->inuse; @@ -4980,12 +5125,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, { void *p; void *addr = slab_address(slab); - unsigned long flags; - - slab_lock(slab, &flags); if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) - goto unlock; + return; /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); @@ -4996,8 +5138,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, if (!check_object(s, slab, p, val)) break; } -unlock: - slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, @@ -5068,6 +5208,7 @@ struct location { depot_stack_handle_t handle; unsigned long count; unsigned long addr; + unsigned long waste; long long sum_time; long min_time; long max_time; @@ -5114,13 +5255,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) } static int add_location(struct loc_track *t, struct kmem_cache *s, - const struct track *track) + const struct track *track, + unsigned int orig_size) { long start, end, pos; struct location *l; - unsigned long caddr, chandle; + unsigned long caddr, chandle, cwaste; unsigned long age = jiffies - track->when; depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; #ifdef CONFIG_STACKDEPOT handle = READ_ONCE(track->handle); @@ -5138,11 +5281,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (pos == end) break; - caddr = t->loc[pos].addr; - chandle = t->loc[pos].handle; - if ((track->addr == caddr) && (handle == chandle)) { + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { - l = &t->loc[pos]; l->count++; if (track->when) { l->sum_time += age; @@ -5167,6 +5312,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, end = pos; else if (track->addr == caddr && handle < chandle) end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; else start = pos; } @@ -5190,6 +5338,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->min_pid = track->pid; l->max_pid = track->pid; l->handle = handle; + l->waste = waste; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -5202,18 +5351,21 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, unsigned long *obj_map) { void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); void *p; __fill_map(obj_map, s, slab); for_each_object(p, s, addr, slab->objects) if (!test_bit(__obj_to_index(s, addr, p), obj_map)) - add_location(t, s, get_track(s, p, alloc)); + add_location(t, s, get_track(s, p, alloc), + is_alloc ? get_orig_size(s, p) : + s->object_size); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -5533,11 +5685,13 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) SLAB_ATTR_RO(cache_dma); #endif +#ifdef CONFIG_HARDENED_USERCOPY static ssize_t usersize_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); +#endif static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { @@ -5601,7 +5755,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; @@ -5617,7 +5771,21 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } -SLAB_ATTR_RO(failslab); + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + if (s->refcount > 1) + return -EINVAL; + + if (buf[0] == '1') + WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); + else + WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); + + return length; +} +SLAB_ATTR(failslab); #endif static ssize_t shrink_show(struct kmem_cache *s, char *buf) @@ -5745,6 +5913,29 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); #endif /* CONFIG_SLUB_STATS */ +#ifdef CONFIG_KFENCE +static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE)); +} + +static ssize_t skip_kfence_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = length; + + if (buf[0] == '0') + s->flags &= ~SLAB_SKIP_KFENCE; + else if (buf[0] == '1') + s->flags |= SLAB_SKIP_KFENCE; + else + ret = -EINVAL; + + return ret; +} +SLAB_ATTR(skip_kfence); +#endif + static struct attribute *slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, @@ -5811,7 +6002,12 @@ static struct attribute *slab_attrs[] = { #ifdef CONFIG_FAILSLAB &failslab_attr.attr, #endif +#ifdef CONFIG_HARDENED_USERCOPY &usersize_attr.attr, +#endif +#ifdef CONFIG_KFENCE + &skip_kfence_attr.attr, +#endif NULL }; @@ -5826,7 +6022,6 @@ static ssize_t slab_attr_show(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5834,9 +6029,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - err = attribute->show(s, buf); - - return err; + return attribute->show(s, buf); } static ssize_t slab_attr_store(struct kobject *kobj, @@ -5845,7 +6038,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5853,8 +6045,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - err = attribute->store(s, buf, len); - return err; + return attribute->store(s, buf, len); } static void kmem_cache_release(struct kobject *k) @@ -5879,7 +6070,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) return slab_kset; } -#define ID_STR_LENGTH 64 +#define ID_STR_LENGTH 32 /* Create a unique string id for a slab cache: * @@ -5890,7 +6081,8 @@ static char *create_unique_id(struct kmem_cache *s) char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5912,9 +6104,13 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07u", s->size); + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); - BUG_ON(p > name + ID_STR_LENGTH - 1); + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } + kmsan_unpoison_memory(name, p - name); return name; } @@ -5925,11 +6121,6 @@ static int sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); - if (!kset) { - kobject_init(&s->kobj, &slab_ktype); - return 0; - } - if (!unmergeable && disable_higher_order_debug && (slub_debug & DEBUG_METADATA_FLAGS)) unmergeable = 1; @@ -5948,6 +6139,8 @@ static int sysfs_slab_add(struct kmem_cache *s) * for the symlinks. */ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } s->kobj.kset = kset; @@ -6016,6 +6209,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) al->name = name; al->next = alias_list; alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); return 0; } @@ -6056,9 +6250,8 @@ static int __init slab_sysfs_init(void) mutex_unlock(&slab_mutex); return 0; } - -__initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ +late_initcall(slab_sysfs_init); +#endif /* SLAB_SUPPORTS_SYSFS */ #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) static int slab_debugfs_show(struct seq_file *seq, void *v) @@ -6078,6 +6271,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) else seq_puts(seq, "<not-available>"); + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + if (l->sum_time != l->min_time) { seq_printf(seq, " age=%ld/%llu/%ld", l->min_time, div_u64(l->sum_time, l->count), diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 46ae542118c0..c5398a5960d0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -196,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } +void __weak __meminit pmd_init(void *addr) +{ +} + pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); @@ -203,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pmd_init(p); pud_populate(&init_mm, pud, p); } return pud; } +void __weak __meminit pud_init(void *addr) +{ +} + p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -215,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pud_init(p); p4d_populate(&init_mm, p4d, p); } return p4d; @@ -285,6 +295,69 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return vmemmap_populate_range(start, end, node, altmap, NULL); } +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ +} + +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + return 0; +} + +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + continue; + } else if (altmap) { + /* + * No fallback: In any case we care about, the + * altmap should be reasonably sized and aligned + * such that vmemmap_alloc_block_buf() will always + * succeed. For consistency with the PTE case, + * return an error here as failure could indicate + * a configuration issue with the size of the altmap. + */ + return -ENOMEM; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) + continue; + if (vmemmap_populate_basepages(addr, next, node, altmap)) + return -ENOMEM; + } + return 0; +} + /* * For compound pages bigger than section size (e.g. x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail diff --git a/mm/sparse.c b/mm/sparse.c index e5a8a3a0edd7..2779b419ef2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, - nr_pages - map_offset); section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index 9cee7f6a3809..70e2063ef43a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -43,8 +43,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> -/* How many pages do we try to swap or page in/out together? */ +/* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; +const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { @@ -295,8 +296,20 @@ void folio_rotate_reclaimable(struct folio *folio) } } -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) { + unsigned long cost; + + /* + * Reflect the relative cost of incurring IO and spending CPU + * time on rotations. This doesn't attempt to make a precise + * comparison, it just says: if reloads are about comparable + * between the LRU lists, or rotations are overwhelmingly + * different between them, adjust scan balance for CPU work. + */ + cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; + do { unsigned long lrusize; @@ -310,9 +323,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) - lruvec->file_cost += nr_pages; + lruvec->file_cost += cost; else - lruvec->anon_cost += nr_pages; + lruvec->anon_cost += cost; /* * Decay previous events @@ -335,10 +348,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) } while ((lruvec = parent_lruvec(lruvec))); } -void lru_note_cost_folio(struct folio *folio) +void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), - folio_nr_pages(folio)); + folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) @@ -366,7 +379,7 @@ static void folio_activate_drain(int cpu) folio_batch_move_lru(fbatch, folio_activate_fn); } -static void folio_activate(struct folio *folio) +void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { @@ -385,7 +398,7 @@ static inline void folio_activate_drain(int cpu) { } -static void folio_activate(struct folio *folio) +void folio_activate(struct folio *folio) { struct lruvec *lruvec; @@ -428,6 +441,40 @@ static void __lru_cache_activate_folio(struct folio *folio) local_unlock(&cpu_fbatches.lock); } +#ifdef CONFIG_LRU_GEN +static void folio_inc_refs(struct folio *folio) +{ + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + if (folio_test_unevictable(folio)) + return; + + if (!folio_test_referenced(folio)) { + folio_set_referenced(folio); + return; + } + + if (!folio_test_workingset(folio)) { + folio_set_workingset(folio); + return; + } + + /* see the comment on MAX_NR_TIERS */ + do { + new_flags = old_flags & LRU_REFS_MASK; + if (new_flags == LRU_REFS_MASK) + break; + + new_flags += BIT(LRU_REFS_PGOFF); + new_flags |= old_flags & ~LRU_REFS_MASK; + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); +} +#else +static void folio_inc_refs(struct folio *folio) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * Mark a page as having seen activity. * @@ -440,6 +487,11 @@ static void __lru_cache_activate_folio(struct folio *folio) */ void folio_mark_accessed(struct folio *folio) { + if (lru_gen_enabled()) { + folio_inc_refs(folio); + return; + } + if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { @@ -484,6 +536,11 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + /* see the comment in lru_gen_add_folio() */ + if (lru_gen_enabled() && !folio_test_unevictable(folio) && + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) + folio_set_active(folio); + folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); @@ -493,22 +550,21 @@ void folio_add_lru(struct folio *folio) EXPORT_SYMBOL(folio_add_lru); /** - * lru_cache_add_inactive_or_unevictable - * @page: the page to be added to LRU - * @vma: vma in which page is mapped for determining reclaimability + * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. + * @folio: The folio to be added to the LRU. + * @vma: VMA in which the folio is mapped. * - * Place @page on the inactive or unevictable LRU list, depending on its - * evictability. + * If the VMA is mlocked, @folio is added to the unevictable list. + * Otherwise, it is treated the same way as folio_add_lru(). */ -void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma) +void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(page); + mlock_new_page(&folio->page); else - lru_cache_add(page); + folio_add_lru(folio); } /* @@ -575,7 +631,7 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { - if (folio_test_active(folio) && !folio_test_unevictable(folio)) { + if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); @@ -688,8 +744,8 @@ void deactivate_page(struct page *page) { struct folio *folio = page_folio(page); - if (folio_test_lru(folio) && folio_test_active(folio) && - !folio_test_unevictable(folio)) { + if (folio_test_lru(folio) && !folio_test_unevictable(folio) && + (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; folio_get(folio); @@ -925,22 +981,30 @@ void lru_cache_disable(void) /** * release_pages - batched put_page() - * @pages: array of pages to release + * @arg: array of pages to release * @nr: number of pages * - * Decrement the reference count on all the pages in @pages. If it + * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. */ -void release_pages(struct page **pages, int nr) +void release_pages(release_pages_arg arg, int nr) { int i; + struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { - struct folio *folio = page_folio(pages[i]); + struct folio *folio; + + /* Turn any of the argument types into a folio */ + folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get diff --git a/mm/swap.h b/mm/swap.h index 17936e068c1c..f78065c8ef52 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -void end_swap_bio_write(struct bio *bio); -int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func); +int __swap_writepage(struct page *page, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ @@ -34,17 +32,17 @@ extern struct address_space *swapper_spaces[]; void show_swap_cache_info(void); bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp); void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow); void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -struct page *lookup_swap_cache(swp_entry_t entry, - struct vm_area_struct *vma, - unsigned long addr); -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, @@ -101,17 +99,17 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } -static inline struct page *lookup_swap_cache(swp_entry_t swp, - struct vm_area_struct *vma, - unsigned long addr) +static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { return NULL; } static inline -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { - return find_get_page(mapping, index); + return filemap_get_folio(mapping, index); } static inline bool add_to_swap(struct folio *folio) @@ -124,7 +122,7 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry) return NULL; } -static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, +static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp_mask, void **shadowp) { return -1; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 5a9442979a18..db6c4a26cf59 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -170,6 +170,9 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return 0; + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array = vcalloc(length, sizeof(void *)); @@ -204,6 +207,9 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return; + mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 10b94d64cc25..0bec1f705f8e 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -343,7 +343,7 @@ repeat: get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(folio, entry)) { - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); entry.val = 0; } return entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index e166051566f4..2927507b43d8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,21 +85,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = thp_nr_pages(page); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); + unsigned long i, nr = folio_nr_pages(folio); void *old; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapCache(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - page_ref_add(page, nr); - SetPageSwapCache(page); + folio_ref_add(folio, nr); + folio_set_swapcache(folio); do { xas_lock_irq(&xas); @@ -107,19 +107,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { - VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); old = xas_load(&xas); if (xa_is_value(old)) { if (shadowp) *shadowp = old; } - set_page_private(page + i, entry.val + i); - xas_store(&xas, page); + set_page_private(folio_page(folio, i), entry.val + i); + xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -127,8 +127,8 @@ unlock: if (!xas_error(&xas)) return 0; - ClearPageSwapCache(page); - page_ref_sub(page, nr); + folio_clear_swapcache(folio); + folio_ref_sub(folio, nr); return xas_error(&xas); } @@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio, for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); - VM_BUG_ON_FOLIO(entry != folio, folio); + VM_BUG_ON_PAGE(entry != folio, entry); set_page_private(folio_page(folio, i), 0); xas_next(&xas); } @@ -194,7 +194,7 @@ bool add_to_swap(struct folio *folio) /* * Add it to the swap cache. */ - err = add_to_swap_cache(&folio->page, entry, + err = add_to_swap_cache(folio, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* @@ -218,7 +218,7 @@ bool add_to_swap(struct folio *folio) return true; fail: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); return false; } @@ -237,7 +237,7 @@ void delete_from_swap_cache(struct folio *folio) __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -272,16 +272,19 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, /* * If we are the only user, then try to free up the swap cache. * - * Its ok to check for PageSwapCache without the page lock + * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside - * try_to_free_swap() _with_ the lock. + * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); + struct folio *folio = page_folio(page); + + if (folio_test_swapcache(folio) && !folio_mapped(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); } } @@ -300,15 +303,12 @@ void free_page_and_swap_cache(struct page *page) * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ -void free_pages_and_swap_cache(struct page **pages, int nr) +void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { - struct page **pagep = pages; - int i; - lru_add_drain(); - for (i = 0; i < nr; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, nr); + for (int i = 0; i < nr; i++) + free_swap_cache(encoded_page_ptr(pages[i])); + release_pages(pages, nr); } static inline bool swap_use_vma_readahead(void) @@ -317,24 +317,24 @@ static inline bool swap_use_vma_readahead(void) } /* - * Lookup a swap entry in the swap cache. A found page will be returned + * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the page + * lock getting page table operations atomic even if we drop the folio * lock before returning. */ -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct folio *folio; struct swap_info_struct *si; si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - if (page) { + if (folio) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -342,10 +342,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ - if (unlikely(PageTransCompound(page))) - return page; + if (unlikely(folio_test_large(folio))) + return folio; - readahead = TestClearPageReadahead(page); + readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; @@ -366,34 +366,32 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, } } - return page; + return folio; } /** - * find_get_incore_page - Find and get a page from the page or swap caches. + * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * - * This differs from find_get_page() in that it will also look for the - * page in the swap cache. + * This differs from filemap_get_folio() in that it will also look for the + * folio in the swap cache. * - * Return: The found page or %NULL. + * Return: The found folio or %NULL. */ -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0); - if (!page) - return page; - if (!xa_is_value(page)) - return find_subpage(page, index); + if (!xa_is_value(folio)) + goto out; if (!shmem_mapping(mapping)) return NULL; - swp = radix_to_swp_entry(page); + swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(swp)) return NULL; @@ -401,9 +399,11 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) si = get_swap_device(swp); if (!si) return NULL; - page = find_get_page(swap_address_space(swp), swp_offset(swp)); + index = swp_offset(swp); + folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); - return page; +out: + return folio; } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -411,7 +411,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool *new_page_allocated) { struct swap_info_struct *si; - struct page *page; + struct folio *folio; void *shadow = NULL; *new_page_allocated = false; @@ -420,17 +420,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, int err; /* * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling + * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (page) - return page; + if (folio) + return folio_file_page(folio, swp_offset(entry)); /* * Just skip read ahead for unused swap slot. @@ -448,8 +448,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - page = alloc_page_vma(gfp_mask, vma, addr); - if (!page) + folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + if (!folio) return NULL; /* @@ -459,7 +459,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (!err) break; - put_page(page); + folio_put(folio); if (err != -EEXIST) return NULL; @@ -477,30 +477,30 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * The swap entry is ours to swap in. Prepare the new page. */ - __SetPageLocked(page); - __SetPageSwapBacked(page); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page_folio(page), shadow); + workingset_refault(folio, shadow); - /* Caller will initiate read into locked page */ - lru_cache_add(page); + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); *new_page_allocated = true; - return page; + return &folio->page; fail_unlock: - put_swap_page(page, entry); - unlock_page(page); - put_page(page); + put_swap_folio(folio, entry); + folio_unlock(folio); + folio_put(folio); return NULL; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1fdccd2f1422..908a529bca12 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -63,6 +63,10 @@ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; +unsigned long swapfile_maximum_size; +#ifdef CONFIG_MIGRATION +bool swap_migration_ad_supported; +#endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -128,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); - struct page *page; + struct folio *folio; int ret = 0; - page = find_get_page(swap_address_space(entry), offset); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), offset); + if (!folio) return 0; /* * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special - * case and you should use try_to_free_swap() with explicit lock_page() + * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (trylock_page(page)) { + if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) - ret = try_to_free_swap(page); - unlock_page(page); + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) + ret = folio_free_swap(folio); + folio_unlock(folio); } - put_page(page); + folio_put(folio); return ret; } @@ -768,8 +772,7 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next) /* No free swap slots available */ if (si->highest_bit <= si->lowest_bit) return; - next = si->lowest_bit + - prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); next = max_t(unsigned int, next, si->lowest_bit); } @@ -969,23 +972,23 @@ done: scan: spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { - if (swap_offset_available_and_locked(si, offset)) - goto checks; if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } + if (swap_offset_available_and_locked(si, offset)) + goto checks; } offset = si->lowest_bit; while (offset < scan_base) { - if (swap_offset_available_and_locked(si, offset)) - goto checks; if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } + if (swap_offset_available_and_locked(si, offset)) + goto checks; offset++; } spin_lock(&si->lock); @@ -1328,7 +1331,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void put_swap_page(struct page *page, swp_entry_t entry) +void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1337,7 +1340,7 @@ void put_swap_page(struct page *page, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(thp_nr_pages(page)); + int size = swap_entry_size(folio_nr_pages(folio)); si = _swap_info_get(entry); if (!si) @@ -1427,30 +1430,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) spin_unlock(&p->lock); } -/* - * How many references to page are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -static int page_swapcount(struct page *page) -{ - int count = 0; - struct swap_info_struct *p; - struct swap_cluster_info *ci; - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - p = _swap_info_get(entry); - if (p) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); - count = swap_count(p->swap_map[offset]); - unlock_cluster_or_swap_info(p, ci); - } - return count; -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si; @@ -1465,11 +1444,16 @@ int __swap_count(swp_entry_t entry) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { - int count = 0; pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; + int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); @@ -1570,56 +1554,59 @@ unlock_out: static bool folio_swapped(struct folio *folio) { - swp_entry_t entry; - struct swap_info_struct *si; + swp_entry_t entry = folio_swap_entry(folio); + struct swap_info_struct *si = _swap_info_get(entry); + + if (!si) + return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return page_swapcount(&folio->page) != 0; + return swap_swapcount(si, entry) != 0; - entry = folio_swap_entry(folio); - si = _swap_info_get(entry); - if (si) - return swap_page_trans_huge_swapped(si, entry); - return false; + return swap_page_trans_huge_swapped(si, entry); } -/* - * If swap is getting full, or if there are no more mappings of this page, - * then try_to_free_swap is called to free its swap space. +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. */ -int try_to_free_swap(struct page *page) +bool folio_free_swap(struct folio *folio) { - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) - return 0; + return false; if (folio_test_writeback(folio)) - return 0; + return false; if (folio_swapped(folio)) - return 0; + return false; /* * Once hibernation has begun to create its image of memory, - * there's a danger that one of the calls to try_to_free_swap() + * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free - * the swap from a page which has already been recorded in the - * image as a clean swapcache page, and then reuse its swap for + * the swap from a folio which has already been recorded in the + * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the - * original page might be freed under memory pressure, then + * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) - return 0; + return false; delete_from_swap_cache(folio); folio_set_dirty(folio); - return 1; + return true; } /* @@ -1770,8 +1757,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct folio *folio) { + struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; @@ -1792,7 +1780,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t pteval; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); - pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + pteval = swp_entry_to_pte(make_swapin_error_entry()); set_pte_at(vma->vm_mm, addr, pte, pteval); swap_free(entry); ret = 0; @@ -1843,17 +1831,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - struct page *page; swp_entry_t entry; pte_t *pte; struct swap_info_struct *si; - unsigned long offset; int ret = 0; volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { + struct folio *folio; + unsigned long offset; + if (!is_swap_pte(*pte)) continue; @@ -1864,8 +1853,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); swap_map = &si->swap_map[offset]; - page = lookup_swap_cache(entry, vma, addr); - if (!page) { + folio = swap_cache_get_folio(entry, vma, addr); + if (!folio) { + struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1875,25 +1865,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (page) + folio = page_folio(page); } - if (!page) { + if (!folio) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; return -ENOMEM; } - lock_page(page); - wait_on_page_writeback(page); - ret = unuse_pte(vma, pmd, addr, entry, page); + folio_lock(folio); + folio_wait_writeback(folio); + ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1990,14 +1982,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } + cond_resched(); } mmap_read_unlock(mm); @@ -2042,7 +2036,7 @@ static int try_to_unuse(unsigned int type) struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct page *page; + struct folio *folio; swp_entry_t entry; unsigned int i; @@ -2092,21 +2086,21 @@ retry: (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - page = find_get_page(swap_address_space(entry), i); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), i); + if (!folio) continue; /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock. The page + * It is conceivable that a racing task removed this folio from + * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But - * that is okay, try_to_free_swap() only removes stale pages. + * that is okay, folio_free_swap() only removes stale folios. */ - lock_page(page); - wait_on_page_writeback(page); - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + folio_wait_writeback(folio); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); } /* @@ -2816,7 +2810,7 @@ unsigned long generic_max_swapfile_size(void) } /* Can be overridden by an architecture for additional checks. */ -__weak unsigned long max_swapfile_size(void) +__weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } @@ -2856,7 +2850,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->cluster_next = 1; p->cluster_nr = 0; - maxpages = max_swapfile_size(); + maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); @@ -3094,7 +3088,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = - 1 + prandom_u32_max(p->highest_bit); + get_random_u32_inclusive(1, p->highest_bit); } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); @@ -3655,7 +3649,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); + blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } @@ -3677,6 +3671,13 @@ static int __init swapfile_init(void) for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); + swapfile_maximum_size = arch_max_swapfile_size(); + +#ifdef CONFIG_MIGRATION + if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) + swap_migration_ad_supported = true; +#endif /* CONFIG_MIGRATION */ + return 0; } subsys_initcall(swapfile_init); diff --git a/mm/truncate.c b/mm/truncate.c index 0b0708bf935f..7b4ea4c4a46b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -240,7 +240,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; - if (split_huge_page(&folio->page) == 0) + if (split_folio(folio) == 0) return true; if (folio_test_dirty(folio)) return false; @@ -361,9 +361,8 @@ void truncate_inode_pages_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; - while (index < end && find_lock_entries(mapping, index, end - 1, + while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { - index = indices[folio_batch_count(&fbatch) - 1] + 1; truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); @@ -401,7 +400,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &fbatch, + if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -415,21 +414,18 @@ void truncate_inode_pages_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ - index = indices[i]; if (xa_is_value(folio)) continue; folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); - index = folio_index(folio) + folio_nr_pages(folio) - 1; } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); - index++; } } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -510,20 +506,17 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, int i; folio_batch_init(&fbatch); - while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ - index = indices[i]; if (xa_is_value(folio)) { count += invalidate_exceptional_entry(mapping, - index, - folio); + indices[i], folio); continue; } - index += folio_nr_pages(folio) - 1; ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); @@ -542,7 +535,6 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } return count; } @@ -573,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently - * sitting in the lru_cache_add() pagevecs. + * sitting in the folio_add_lru() pagevecs. */ static int invalidate_complete_folio2(struct address_space *mapping, struct folio *folio) @@ -641,16 +633,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &fbatch, indices)) { + while (find_get_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ - index = indices[i]; if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, folio)) + indices[i], folio)) ret = -EBUSY; continue; } @@ -660,13 +651,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ - unmap_mapping_pages(mapping, index, - (1 + end - index), false); + unmap_mapping_pages(mapping, indices[i], + (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); if (folio->mapping != mapping) { folio_unlock(folio); continue; @@ -689,7 +680,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } /* * For DAX we invalidate page tables after invalidating page cache. We diff --git a/mm/usercopy.c b/mm/usercopy.c index c1ee15a98633..4c3164beacec 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -258,7 +259,7 @@ static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { - if (strtobool(str, &enable_checks)) + if (kstrtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7327b2573f7c..0499907b6f1a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -64,8 +64,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; - bool page_in_cache = page->mapping; + bool page_in_cache = page_mapping(page); spinlock_t *ptl; + struct folio *folio; struct inode *inode; pgoff_t offset, max_off; @@ -113,14 +114,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none_mostly(*dst_pte)) goto out_unlock; + folio = page_folio(page); if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) - lru_cache_add(page); + folio_add_lru(folio); page_add_file_rmap(page, dst_vma, false); } else { page_add_new_anon_rmap(page, dst_vma, dst_addr); - lru_cache_add_inactive_or_unevictable(page, dst_vma); + folio_add_lru_vma(folio, dst_vma); } /* @@ -157,11 +159,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -243,20 +262,22 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; struct page *page; int ret; - ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find page. */ + ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; if (ret) goto out; - if (!page) { + if (!folio) { ret = -EFAULT; goto out; } + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; goto out_release; @@ -267,13 +288,13 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -377,30 +398,30 @@ retry: BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even + * Serialize via vma_lock and hugetlb_fault_mutex. + * vma_lock ensures the dst_pte remains valid even * in the case of shared pmds. fault mutex prevents * races with other faulting threads. */ - mapping = dst_vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(dst_vma); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } if (mode != MCOPY_ATOMIC_CONTINUE && !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -408,8 +429,8 @@ retry: dst_addr, src_addr, mode, &page, wp_copy); + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); cond_resched(); @@ -611,7 +632,7 @@ retry: break; } - dst_pmdval = pmd_read_atomic(dst_pmd); + dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is mapped as THP don't * override it and just be strict. @@ -644,11 +665,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; diff --git a/mm/util.c b/mm/util.c index c9439c66d8cf..b56c92fb910f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -272,38 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user_nul); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - next = mm->mmap; - mm->mmap = vma; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; -} - -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev, *next; - - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) - next->vm_prev = prev; -} - /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { @@ -619,6 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); @@ -745,32 +717,6 @@ void *page_rmapping(struct page *page) return folio_raw_mapping(page_folio(page)); } -/** - * folio_mapped - Is this folio mapped into userspace? - * @folio: The folio. - * - * Return: True if any page in this folio is referenced by user page tables. - */ -bool folio_mapped(struct folio *folio) -{ - long i, nr; - - if (!folio_test_large(folio)) - return atomic_read(&folio->_mapcount) >= 0; - if (atomic_read(folio_mapcount_ptr(folio)) >= 0) - return true; - if (folio_test_hugetlb(folio)) - return false; - - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) { - if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0) - return true; - } - return false; -} -EXPORT_SYMBOL(folio_mapped); - struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -811,59 +757,6 @@ struct address_space *folio_mapping(struct folio *folio) } EXPORT_SYMBOL(folio_mapping); -/* Slow path of page_mapcount() for compound pages */ -int __page_mapcount(struct page *page) -{ - int ret; - - ret = atomic_read(&page->_mapcount) + 1; - /* - * For file THP page->_mapcount contains total number of mapping - * of the page: no need to look into compound_mapcount. - */ - if (!PageAnon(page) && !PageHuge(page)) - return ret; - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - return ret; -} -EXPORT_SYMBOL_GPL(__page_mapcount); - -/** - * folio_mapcount() - Calculate the number of mappings of this folio. - * @folio: The folio. - * - * A large folio tracks both how many times the entire folio is mapped, - * and how many times each individual page in the folio is mapped. - * This function calculates the total number of times the folio is - * mapped. - * - * Return: The number of times this folio is mapped. - */ -int folio_mapcount(struct folio *folio) -{ - int i, compound, nr, ret; - - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) + 1; - - compound = folio_entire_mapcount(folio); - nr = folio_nr_pages(folio); - if (folio_test_hugetlb(folio)) - return compound; - ret = compound; - for (i = 0; i < nr; i++) - ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; - /* File pages has compound_mapcount included in _mapcount */ - if (!folio_test_anon(folio)) - return ret - compound * nr; - if (folio_test_double_map(folio)) - ret -= nr; - return ret; -} - /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. @@ -1052,6 +945,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: + pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n", + __func__, current->pid, current->comm); vm_unacct_memory(pages); return -ENOMEM; diff --git a/mm/vmacache.c b/mm/vmacache.c deleted file mode 100644 index 01a6e6688ec1..000000000000 --- a/mm/vmacache.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2014 Davidlohr Bueso. - */ -#include <linux/sched/signal.h> -#include <linux/sched/task.h> -#include <linux/mm.h> -#include <linux/vmacache.h> - -/* - * Hash based on the pmd of addr if configured with MMU, which provides a good - * hit rate for workloads with spatial locality. Otherwise, use pages. - */ -#ifdef CONFIG_MMU -#define VMACACHE_SHIFT PMD_SHIFT -#else -#define VMACACHE_SHIFT PAGE_SHIFT -#endif -#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) - -/* - * This task may be accessing a foreign mm via (for example) - * get_user_pages()->find_vma(). The vmacache is task-local and this - * task's vmacache pertains to a different mm (ie, its own). There is - * nothing we can do here. - * - * Also handle the case where a kernel thread has adopted this mm via - * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. - */ -static inline bool vmacache_valid_mm(struct mm_struct *mm) -{ - return current->mm == mm && !(current->flags & PF_KTHREAD); -} - -void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) -{ - if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; -} - -static bool vmacache_valid(struct mm_struct *mm) -{ - struct task_struct *curr; - - if (!vmacache_valid_mm(mm)) - return false; - - curr = current; - if (mm->vmacache_seqnum != curr->vmacache.seqnum) { - /* - * First attempt will always be invalid, initialize - * the new cache for this task here. - */ - curr->vmacache.seqnum = mm->vmacache_seqnum; - vmacache_flush(curr); - return false; - } - return true; -} - -struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) -{ - int idx = VMACACHE_HASH(addr); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma) { -#ifdef CONFIG_DEBUG_VM_VMACACHE - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; -#endif - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} - -#ifndef CONFIG_MMU -struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - int idx = VMACACHE_HASH(start); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma && vma->vm_start == start && vma->vm_end == end) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} -#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dd6cdb201195..ca71de7c9d77 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -43,6 +43,9 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +#define CREATE_TRACE_POINTS +#include <trace/events/vmalloc.h> + #include "internal.h" #include "pgalloc-track.h" @@ -320,6 +323,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end, err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); + if (!err) + kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -416,7 +422,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -void vunmap_range_noflush(unsigned long start, unsigned long end) +void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; @@ -438,6 +444,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end) arch_sync_kernel_mappings(start, end); } +void vunmap_range_noflush(unsigned long start, unsigned long end) +{ + kmsan_vunmap_range_noflush(start, end); + __vunmap_range_noflush(start, end); +} + /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap @@ -575,7 +587,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -590,7 +602,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), - __pa(page_address(pages[i])), prot, + page_to_phys(pages[i]), prot, page_shift); if (err) return err; @@ -601,6 +613,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map @@ -1300,12 +1319,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size, #include <linux/random.h> static struct vmap_area * -find_vmap_lowest_linear_match(unsigned long size, +find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; - list_for_each_entry(va, &free_vmap_area_list, list) { + list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; @@ -1316,7 +1335,8 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size, unsigned long align) +find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1325,8 +1345,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart, false); - va_2 = find_vmap_lowest_linear_match(size, align, vstart); + va_1 = find_vmap_lowest_match(root, size, align, vstart, false); + va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1513,7 +1533,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size, align); + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; @@ -1603,6 +1623,8 @@ retry: size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); + trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); + /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. @@ -1708,6 +1730,7 @@ static void purge_fragmented_blocks_allcpus(void); static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; + unsigned int num_purged_areas = 0; struct list_head local_purge_list; struct vmap_area *va, *n_va; @@ -1719,7 +1742,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) spin_unlock(&purge_vmap_area_lock); if (unlikely(list_empty(&local_purge_list))) - return false; + goto out; start = min(start, list_first_entry(&local_purge_list, @@ -1754,12 +1777,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); + num_purged_areas++; if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); - return true; + +out: + trace_purge_vmap_area_lazy(start, end, num_purged_areas); + return num_purged_areas > 0; } /* @@ -1794,6 +1821,8 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { + unsigned long nr_lazy_max = lazy_max_pages(); + unsigned long va_start = va->va_start; unsigned long nr_lazy; spin_lock(&vmap_area_lock); @@ -1811,8 +1840,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) &purge_vmap_area_root, &purge_vmap_area_list); spin_unlock(&purge_vmap_area_lock); + trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); + /* After this point, we may free va at any time */ - if (unlikely(nr_lazy > lazy_max_pages())) + if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } diff --git a/mm/vmscan.c b/mm/vmscan.c index b2b1431352dc..bd6637fcd8f9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -43,12 +43,18 @@ #include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> +#include <linux/memory-tiers.h> #include <linux/oom.h> #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> #include <linux/psi.h> +#include <linux/pagewalk.h> +#include <linux/shmem_fs.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/khugepaged.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -85,7 +91,7 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; - /* Can active pages be deactivated as part of reclaim? */ + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; @@ -95,10 +101,10 @@ struct scan_control { /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; - /* Can mapped pages be reclaimed? */ + /* Can mapped folios be reclaimed? */ unsigned int may_unmap:1; - /* Can pages be swapped as part of reclaim? */ + /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; /* Proactive reclaim invoked by userspace through memory.reclaim */ @@ -123,19 +129,25 @@ struct scan_control { /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; - /* The file pages on the current node are dangerously low */ + /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; +#ifdef CONFIG_LRU_GEN + /* help kswapd make better choices among multiple memcgs */ + unsigned int memcgs_need_aging:1; + unsigned long last_reclaimed; +#endif + /* Allocation order */ s8 order; /* Scan (total_size >> priority) pages at once */ s8 priority; - /* The highest zone to isolate pages for reclaim from */ + /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ @@ -443,7 +455,7 @@ static bool cgroup_reclaim(struct scan_control *sc) * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in - * shrink_page_list() is used for throttling instead, which lacks all the + * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * @@ -564,9 +576,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, } /* - * This misses isolated pages which are not accounted for to save counters. + * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is - * not expected that isolated pages will be a dominating factor. + * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { @@ -1009,39 +1021,60 @@ out: return freed; } -static void drop_slab_node(int nid) +static unsigned long drop_slab_node(int nid) { - unsigned long freed; - int shift = 0; + unsigned long freed = 0; + struct mem_cgroup *memcg = NULL; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct mem_cgroup *memcg = NULL; + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - if (fatal_signal_pending(current)) - return; + return freed; +} +void drop_slab(void) +{ + int nid; + int shift = 0; + unsigned long freed; + + do { freed = 0; - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + for_each_online_node(nid) { + if (fatal_signal_pending(current)) + return; + + freed += drop_slab_node(nid); + } } while ((freed >> shift++) > 1); } -void drop_slab(void) +static int reclaimer_offset(void) { - int nid; + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGSCAN_DIRECT - PGSCAN_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); - for_each_online_node(nid) - drop_slab_node(nid); + if (current_is_kswapd()) + return 0; + if (current_is_khugepaged()) + return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } static inline int is_page_cache_freeable(struct folio *folio) { /* - * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache and optional buffer - * heads at page->private. + * A freeable page cache folio is referenced only by the caller + * that isolated the folio, the page cache and optional filesystem + * private data at folio->private. */ return folio_ref_count(folio) - folio_test_private(folio) == 1 + folio_nr_pages(folio); @@ -1081,8 +1114,8 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) return true; /* - * If there are a lot of dirty/writeback pages then do not - * throttle as throttling will occur when the pages cycle + * If there are a lot of dirty/writeback folios then do not + * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ for (i = 0; i < MAX_NR_ZONES; i++) { @@ -1125,7 +1158,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from - * writeback to a slow device to excessive references pages at the tail + * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { @@ -1171,8 +1204,8 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) } /* - * Account for pages written if tasks are throttled waiting on dirty - * pages to clean. If enough pages have been cleaned since throttling + * Account for folios written if tasks are throttled waiting on dirty + * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -1198,18 +1231,18 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, /* possible outcome of pageout() */ typedef enum { - /* failed to write page out, page is locked */ + /* failed to write folio out, folio is locked */ PAGE_KEEP, - /* move page to the active list, page is locked */ + /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ + /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, - /* page is clean and locked */ + /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; /* - * pageout is called by shrink_page_list() for each dirty page. + * pageout is called by shrink_folio_list() for each dirty folio. * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, @@ -1283,7 +1316,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } /* - * Same as remove_mapping, but if the page is removed from the mapping, it + * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, @@ -1299,34 +1332,34 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* - * The non racy check for a busy page. + * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has - * a ref to the page, it may be possible that they dirty it then - * drop the reference. So if PageDirty is tested before page_count - * here, then the following race may occur: + * a ref to the folio, it may be possible that they dirty it then + * drop the reference. So if the dirty flag is tested before the + * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); - * !PageDirty(page) [good] - * SetPageDirty(page); - * put_page(page); - * !page_count(page) [good, discard it] + * !folio_test_dirty(folio) [good] + * folio_set_dirty(folio); + * folio_put(folio); + * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot - * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_refcount. + * escape unnoticed. The smp_rmb is needed to ensure the folio->flags + * load is not satisfied before that of folio->_refcount. * - * Note that if SetPageDirty is always performed via set_page_dirty, + * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ + /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; @@ -1334,12 +1367,13 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (folio_test_swapcache(folio)) { swp_entry_t swap = folio_swap_entry(folio); - mem_cgroup_swapout(folio, swap); + if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); __delete_from_swap_cache(folio, swap, shadow); + mem_cgroup_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -1355,7 +1389,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, * back. * * We also don't store shadows for DAX mappings because the - * only page cache pages found in these are zero pages + * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. @@ -1423,14 +1457,14 @@ void folio_putback_lru(struct folio *folio) folio_put(folio); /* drop ref from isolate */ } -enum page_references { - PAGEREF_RECLAIM, - PAGEREF_RECLAIM_CLEAN, - PAGEREF_KEEP, - PAGEREF_ACTIVATE, +enum folio_references { + FOLIOREF_RECLAIM, + FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_KEEP, + FOLIOREF_ACTIVATE, }; -static enum page_references folio_check_references(struct folio *folio, +static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; @@ -1445,11 +1479,11 @@ static enum page_references folio_check_references(struct folio *folio, * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* rmap lock contention: rotate */ if (referenced_ptes == -1) - return PAGEREF_KEEP; + return FOLIOREF_KEEP; if (referenced_ptes) { /* @@ -1469,34 +1503,34 @@ static enum page_references folio_check_references(struct folio *folio, folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; - return PAGEREF_KEEP; + return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) - return PAGEREF_RECLAIM_CLEAN; + return FOLIOREF_RECLAIM_CLEAN; - return PAGEREF_RECLAIM; + return FOLIOREF_RECLAIM; } -/* Check if a page is dirty or under writeback */ +/* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* - * Anonymous pages are not handled by flushers and must be written + * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. - * MADV_FREE anonymous pages are put into inactive file list too. + * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ @@ -1520,49 +1554,73 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -static struct page *alloc_demote_page(struct page *page, unsigned long node) +static struct page *alloc_demote_page(struct page *page, unsigned long private) { - struct migration_target_control mtc = { - /* - * Allocate from 'node', or fail quickly and quietly. - * When this happens, 'page' will likely just be discarded - * instead of migrated. - */ - .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | - __GFP_THISNODE | __GFP_NOWARN | - __GFP_NOMEMALLOC | GFP_NOWAIT, - .nid = node - }; + struct page *target_page; + nodemask_t *allowed_mask; + struct migration_target_control *mtc; - return alloc_migration_target(page, (unsigned long)&mtc); + mtc = (struct migration_target_control *)private; + + allowed_mask = mtc->nmask; + /* + * make sure we allocate from the target node first also trying to + * demote or reclaim pages from the target node via kswapd if we are + * low on free memory on target node. If we don't do this and if + * we have free memory on the slower(lower) memtier, we would start + * allocating pages from slower(lower) memory tiers without even forcing + * a demotion of cold pages from the target memtier. This can result + * in the kernel placing hot pages in slower(lower) memory tiers. + */ + mtc->nmask = NULL; + mtc->gfp_mask |= __GFP_THISNODE; + target_page = alloc_migration_target(page, (unsigned long)mtc); + if (target_page) + return target_page; + + mtc->gfp_mask &= ~__GFP_THISNODE; + mtc->nmask = allowed_mask; + + return alloc_migration_target(page, (unsigned long)mtc); } /* - * Take pages on @demote_list and attempt to demote them to - * another node. Pages which are not demoted are left on - * @demote_pages. + * Take folios on @demote_folios and attempt to demote them to another node. + * Folios which are not demoted are left on @demote_folios. */ -static unsigned int demote_page_list(struct list_head *demote_pages, +static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; + nodemask_t allowed_mask; - if (list_empty(demote_pages)) + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = target_nid, + .nmask = &allowed_mask + }; + + if (list_empty(demote_folios)) return 0; if (target_nid == NUMA_NO_NODE) return 0; + node_get_allowed_targets(pgdat, &allowed_mask); + /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_pages, alloc_demote_page, NULL, - target_nid, MIGRATE_ASYNC, MR_DEMOTION, - &nr_succeeded); + migrate_pages(demote_folios, alloc_demote_page, NULL, + (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); - if (current_is_kswapd()) - __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); - else - __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); return nr_succeeded; } @@ -1584,17 +1642,15 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) } /* - * shrink_page_list() returns the number of reclaimed pages + * shrink_folio_list() returns the number of reclaimed pages */ -static unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references) -{ - LIST_HEAD(ret_pages); - LIST_HEAD(free_pages); - LIST_HEAD(demote_pages); +static unsigned int shrink_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references) +{ + LIST_HEAD(ret_folios); + LIST_HEAD(free_folios); + LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; @@ -1605,16 +1661,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, do_demote_pass = can_demote(pgdat->node_id, sc); retry: - while (!list_empty(page_list)) { + while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; - enum page_references references = PAGEREF_RECLAIM; + enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); - folio = lru_to_folio(page_list); + folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) @@ -1633,6 +1689,11 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; + /* folio_update_gen() tried to promote this page? */ + if (lru_gen_enabled() && !ignore_references && + folio_mapped(folio) && folio_test_referenced(folio)) + goto keep_locked; + /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -1733,7 +1794,7 @@ retry: folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ - list_add_tail(&folio->lru, page_list); + list_add_tail(&folio->lru, folio_list); continue; } } @@ -1742,13 +1803,13 @@ retry: references = folio_check_references(folio, sc); switch (references) { - case PAGEREF_ACTIVATE: + case FOLIOREF_ACTIVATE: goto activate_locked; - case PAGEREF_KEEP: + case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; - case PAGEREF_RECLAIM: - case PAGEREF_RECLAIM_CLEAN: + case FOLIOREF_RECLAIM: + case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } @@ -1758,7 +1819,7 @@ retry: */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { - list_add(&folio->lru, &demote_pages); + list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } @@ -1785,7 +1846,7 @@ retry: */ if (!folio_entire_mapcount(folio) && split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { @@ -1793,7 +1854,7 @@ retry: goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); @@ -1805,7 +1866,7 @@ retry: } else if (folio_test_swapbacked(folio) && folio_test_large(folio)) { /* Split shmem folio */ - if (split_folio_to_list(folio, page_list)) + if (split_folio_to_list(folio, folio_list)) goto keep_locked; } @@ -1870,7 +1931,7 @@ retry: goto activate_locked; } - if (references == PAGEREF_RECLAIM_CLEAN) + if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; @@ -1983,13 +2044,13 @@ free_it: nr_reclaimed += nr_pages; /* - * Is there need to periodically free_page_list? It would + * Is there need to periodically free_folio_list? It would * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) destroy_large_folio(folio); else - list_add(&folio->lru, &free_pages); + list_add(&folio->lru, &free_folios); continue; activate_locked_split: @@ -2004,9 +2065,8 @@ activate_locked_split: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && - (mem_cgroup_swap_full(&folio->page) || - folio_test_mlocked(folio))) - try_to_free_swap(&folio->page); + (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) + folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); @@ -2017,29 +2077,48 @@ activate_locked: keep_locked: folio_unlock(folio); keep: - list_add(&folio->lru, &ret_pages); + list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } - /* 'page_list' is always empty here */ + /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Folios that could not be demoted are still in @demote_pages */ - if (!list_empty(&demote_pages)) { - /* Folios which weren't demoted go back on @page_list for retry: */ - list_splice_init(&demote_pages, page_list); - do_demote_pass = false; - goto retry; + nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + /* Folios that could not be demoted are still in @demote_folios */ + if (!list_empty(&demote_folios)) { + /* Folios which weren't demoted go back on @folio_list */ + list_splice_init(&demote_folios, folio_list); + + /* + * goto retry to reclaim the undemoted folios in folio_list if + * desired. + * + * Reclaiming directly from top tier nodes is not often desired + * due to it breaking the LRU ordering: in general memory + * should be reclaimed from lower tier nodes and demoted from + * top tier nodes. + * + * However, disabling reclaim from top tier nodes entirely + * would cause ooms in edge scenarios where lower tier memory + * is unreclaimable for whatever reason, eg memory being + * mlocked or too hot to reclaim. We can disable reclaim + * from top tier nodes in proactive reclaim though as that is + * not real memory pressure. + */ + if (!sc->proactive) { + do_demote_pass = false; + goto retry; + } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_pages); + mem_cgroup_uncharge_list(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_pages); + free_unref_page_list(&free_folios); - list_splice(&ret_pages, page_list); + list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) @@ -2048,7 +2127,7 @@ keep: } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *folio_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2076,7 +2155,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); @@ -2135,7 +2214,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -2242,8 +2321,8 @@ move: * * Context: * - * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages() (which is called + * (1) Must be called with an elevated refcount on the folio. This is a + * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. @@ -2315,13 +2394,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * move_pages_to_lru() moves folios from private @list to appropriate LRU list. + * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ -static unsigned int move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct lruvec *lruvec, + struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(folios_to_free); @@ -2341,7 +2420,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: - * #0 move_pages_to_lru #1 release_pages + * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock @@ -2398,11 +2477,11 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, + enum lru_list lru) { - LIST_HEAD(page_list); + LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; @@ -2429,11 +2508,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); @@ -2444,36 +2523,48 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &page_list); + move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout); - mem_cgroup_uncharge_list(&page_list); - free_unref_page_list(&page_list); + lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); + mem_cgroup_uncharge_list(&folio_list); + free_unref_page_list(&folio_list); /* - * If dirty pages are scanned that are not queued for IO, it + * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can - * happen when memory pressure pushes dirty pages to the end of + * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory + * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ - if (stat.nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; @@ -2526,7 +2617,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2550,8 +2641,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (unlikely(buffer_heads_over_limit)) { - if (folio_get_private(folio) && folio_trylock(folio)) { - if (folio_get_private(folio)) + if (folio_test_private(folio) && folio_trylock(folio)) { + if (folio_test_private(folio)) filemap_release_folio(folio, 0); folio_unlock(folio); } @@ -2586,8 +2677,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ spin_lock_irq(&lruvec->lru_lock); - nr_activate = move_pages_to_lru(lruvec, &l_active); - nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + nr_activate = move_folios_to_lru(lruvec, &l_active); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); /* Keep all free folios in l_active list */ list_splice(&l_inactive, &l_active); @@ -2597,13 +2688,15 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); + if (nr_rotated) + lru_note_cost(lruvec, file, 0, nr_rotated); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } -static unsigned int reclaim_page_list(struct list_head *page_list, +static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { struct reclaim_stat dummy_stat; @@ -2617,9 +2710,9 @@ static unsigned int reclaim_page_list(struct list_head *page_list, .no_demotion = 1, }; - nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); - while (!list_empty(page_list)) { - folio = lru_to_folio(page_list); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } @@ -2649,11 +2742,11 @@ unsigned long reclaim_pages(struct list_head *folio_list) continue; } - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); @@ -2683,13 +2776,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive - * page has a chance to be referenced again before it is reclaimed. + * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * - * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio - * of 3 means 3:1 or 25% of the pages are kept on the inactive list. + * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive @@ -2728,12 +2821,118 @@ enum scan_balance { SCAN_FILE, }; +static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long file; + struct lruvec *target_lruvec; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. * - * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan - * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan + * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) @@ -2748,7 +2947,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long ap, fp; enum lru_list lru; - /* If we have no swap space, do not bother scanning anon pages. */ + /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; @@ -2947,6 +3146,2762 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, return can_demote(pgdat->node_id, sc); } +#ifdef CONFIG_LRU_GEN + +#ifdef CONFIG_LRU_GEN_ENABLED +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) +#else +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) +#endif + +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ + +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) + +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ + } + +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + +#ifdef CONFIG_MEMCG + if (memcg) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* see the comment in mem_cgroup_lruvec() */ + if (!lruvec->pgdat) + lruvec->pgdat = pgdat; + + return lruvec; + } +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &pgdat->__lruvec; +} + +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) +{ + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; + + return mem_cgroup_swappiness(memcg); +} + +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; +} + +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + /* see the comment on lru_gen_struct */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; +} + +/****************************************************************************** + * mm_struct list + ******************************************************************************/ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &mm_list; +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); +#ifdef CONFIG_MEMCG + VM_WARN_ON_ONCE(mm->lru_gen.memcg); + mm->lru_gen.memcg = memcg; +#endif + spin_lock(&mm_list->lock); + + for_each_node_state(nid, N_MEMORY) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + /* the first addition since the last iteration */ + if (lruvec->mm_state.tail == &mm_list->fifo) + lruvec->mm_state.tail = &mm->lru_gen.list; + } + + list_add_tail(&mm->lru_gen.list, &mm_list->fifo); + + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lru_gen.list)) + return; + +#ifdef CONFIG_MEMCG + memcg = mm->lru_gen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + /* where the last iteration ended (exclusive) */ + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; + + /* where the current iteration continues (inclusive) */ + if (lruvec->mm_state.head != &mm->lru_gen.list) + continue; + + lruvec->mm_state.head = lruvec->mm_state.head->next; + /* the deletion ends the current iteration */ + if (lruvec->mm_state.head == &mm_list->fifo) + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + mm->lru_gen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + struct task_struct *task = rcu_dereference_protected(mm->owner, true); + + VM_WARN_ON_ONCE(task->mm != mm); + lockdep_assert_held(&task->alloc_lock); + + /* for mm_update_next_owner() */ + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(task); + rcu_read_unlock(); + if (memcg == mm->lru_gen.memcg) + return; + + VM_WARN_ON_ONCE(!mm->lru_gen.memcg); + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +{ + int i; + int hist; + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + if (walk) { + hist = lru_hist_from_seq(walk->max_seq); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(lruvec->mm_state.stats[hist][i], + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; + } + } + + if (NR_HIST_GENS > 1 && last) { + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + + for (i = 0; i < NR_MM_STATS; i++) + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); + } +} + +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + int type; + unsigned long size = 0; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return true; + + clear_bit(key, &mm->lru_gen.bitmap); + + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { + size += type ? get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < MIN_LRU_BATCH) + return true; + + return !mmget_not_zero(mm); +} + +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, + struct mm_struct **iter) +{ + bool first = false; + bool last = true; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* + * There are four interesting cases for this page table walker: + * 1. It tries to start a new iteration of mm_list with a stale max_seq; + * there is nothing left to do. + * 2. It's the first of the current generation, and it needs to reset + * the Bloom filter for the next generation. + * 3. It reaches the end of mm_list, and it needs to increment + * mm_state->seq; the iteration is done. + * 4. It's the last of the current generation, and it needs to reset the + * mm stats counters for the next generation. + */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); + VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); + + if (walk->max_seq <= mm_state->seq) { + if (!*iter) + last = false; + goto done; + } + + if (!mm_state->nr_walkers) { + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); + + mm_state->head = mm_list->fifo.next; + first = true; + } + + while (!mm && mm_state->head != &mm_list->fifo) { + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + + mm_state->head = mm_state->head->next; + + /* force scan for those added after the last iteration */ + if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { + mm_state->tail = mm_state->head; + walk->force_scan = true; + } + + if (should_skip_mm(mm, walk)) + mm = NULL; + } + + if (mm_state->head == &mm_list->fifo) + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); +done: + if (*iter && !mm) + mm_state->nr_walkers--; + if (!*iter && mm) + mm_state->nr_walkers++; + + if (mm_state->nr_walkers) + last = false; + + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + + spin_unlock(&mm_list->lock); + + if (mm && first) + reset_bloom_filter(lruvec, walk->max_seq + 1); + + if (*iter) + mmput_async(*iter); + + *iter = mm; + + return last; +} + +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +{ + bool success = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + + if (max_seq > mm_state->seq && !mm_state->nr_walkers) { + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); + + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + reset_mm_stats(lruvec, NULL, true); + success = true; + } + + spin_unlock(&mm_list->lock); + + return success; +} + +/****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop based on Proportional-Integral-Derivative (PID) controller. + * + * The P term is refaulted/(evicted+protected) from a tier in the generation + * currently being evicted; the I term is the exponential moving average of the + * P term over the generations previously evicted, using the smoothing factor + * 1/2; the D term isn't supported. + * + * The setpoint (SP) is always the first tier of one type; the process variable + * (PV) is either any tier of the other type or any other tier of the same + * type. + * + * The error is the difference between the SP and the PV; the correction is to + * turn off protection when SP>PV or turn on protection when SP<PV. + * + * For future optimizations: + * 1. The D term may discount the other two terms over time so that long-lived + * generations can resist stale information. + */ +struct ctrl_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) +{ + int hist, tier; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; + + lockdep_assert_held(&lruvec->lru_lock); + + if (!carryover && !clear) + return; + + hist = lru_hist_from_seq(seq); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Return true if the PV has a limited number of refaults or a lower + * refaulted/total than the SP. + */ + return pv->refaulted < MIN_LRU_BATCH || + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= + (sp->refaulted + 1) * pv->total * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +/* promote pages accessed through page tables */ +static int folio_update_gen(struct folio *folio, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + /* lru_gen_del_folio() has isolated this page? */ + if (!(old_flags & LRU_GEN_MASK)) { + /* for shrink_folio_list() */ + new_flags = old_flags | BIT(PG_referenced); + continue; + } + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +/* protect pages accessed multiple times through file descriptors */ +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + int type = folio_is_file_lru(folio); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); + + do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* folio_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + /* for folio_end_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + + return new_gen; +} + +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, + int old_gen, int new_gen) +{ + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); + + walk->batched++; + + walk->nr_pages[old_gen][type][zone] -= delta; + walk->nr_pages[new_gen][type][zone] += delta; +} + +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +{ + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + walk->batched = 0; + + for_each_gen_type_zone(gen, type, zone) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + int delta = walk->nr_pages[gen][type][zone]; + + if (!delta) + continue; + + walk->nr_pages[gen][type][zone] = 0; + WRITE_ONCE(lrugen->nr_pages[gen][type][zone], + lrugen->nr_pages[gen][type][zone] + delta); + + if (lru_gen_is_active(lruvec, gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, delta); + } +} + +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) +{ + struct address_space *mapping; + struct vm_area_struct *vma = args->vma; + struct lru_gen_mm_walk *walk = args->private; + + if (!vma_is_accessible(vma)) + return true; + + if (is_vm_hugetlb_page(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + if (vma_is_anonymous(vma)) + return !walk->can_swap; + + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) + return true; + + mapping = vma->vm_file->f_mapping; + if (mapping_unevictable(mapping)) + return true; + + if (shmem_mapping(mapping)) + return !walk->can_swap; + + /* to exclude special mappings like dax, etc. */ + return !mapping->a_ops->read_folio; +} + +/* + * Some userspace memory allocators map many single-page VMAs. Instead of + * returning back to the PGD table for each of such VMAs, finish an entire PMD + * table to reduce zigzags and improve cache performance. + */ +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, + unsigned long *vm_start, unsigned long *vm_end) +{ + unsigned long start = round_up(*vm_end, size); + unsigned long end = (start | ~mask) + 1; + VMA_ITERATOR(vmi, args->mm, start); + + VM_WARN_ON_ONCE(mask & size); + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); + + for_each_vma(vmi, args->vma) { + if (end && end <= args->vma->vm_start) + return false; + + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) + continue; + + *vm_start = max(start, args->vma->vm_start); + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; + + return true; + } + + return false; +} + +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pmd_pfn(pmd); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) + return -1; + + if (WARN_ON_ONCE(pmd_devmap(pmd))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} +#endif + +static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat, bool can_swap) +{ + struct folio *folio; + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return NULL; + + folio = pfn_folio(pfn); + if (folio_nid(folio) != pgdat->node_id) + return NULL; + + if (folio_memcg_rcu(folio) != memcg) + return NULL; + + /* file VMAs can contain anon pages from COW */ + if (!folio_is_file_lru(folio) && !can_swap) + return NULL; + + return folio; +} + +static bool suitable_to_scan(int total, int young) +{ + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); + + /* suitable if the average number of young PTEs per cacheline is >=1 */ + return young * n >= total; +} + +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; + int total = 0; + int young = 0; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pmd_leaf(*pmd)); + + ptl = pte_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + return false; + + arch_enter_lazy_mmu_mode(); + + pte = pte_offset_map(pmd, start & PMD_MASK); +restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + struct folio *folio; + + total++; + walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(pte[i], args->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + if (!folio) + continue; + + if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } + + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) + goto restart; + + pte_unmap(pte); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + + return suitable_to_scan(total, young); +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ + int i; + pmd_t *pmd; + spinlock_t *ptl; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ + if (*start == -1) { + *start = next; + return; + } + + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + + pmd = pmd_offset(pud, *start); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + goto done; + + arch_enter_lazy_mmu_mode(); + + do { + unsigned long pfn; + struct folio *folio; + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + pfn = get_pmd_pfn(pmd[i], vma, addr); + if (pfn == -1) + goto next; + + if (!pmd_trans_huge(pmd[i])) { + if (arch_has_hw_nonleaf_pmd_young() && + get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } + + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + if (!folio) + goto next; + + if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + goto next; + + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); +next: + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; + } while (i <= MIN_LRU_BATCH); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); +done: + *start = -1; + bitmap_zero(bitmap, MIN_LRU_BATCH); +} +#else +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ +} +#endif + +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pmd_t *pmd; + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; + unsigned long pos = -1; + struct lru_gen_mm_walk *walk = args->private; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* + * Finish an entire PMD in two passes: the first only reaches to PTE + * tables to avoid taking the PMD lock; the second, if necessary, takes + * the PMD lock to clear the accessed bit in PMD entries. + */ + pmd = pmd_offset(pud, start & PUD_MASK); +restart: + /* walk_pte_range() may call get_next_vma() */ + vma = args->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { + pmd_t val = pmdp_get_lockless(pmd + i); + + next = pmd_addr_end(addr, end); + + if (!pmd_present(val) || is_huge_zero_pmd(val)) { + walk->mm_stats[MM_LEAF_TOTAL]++; + continue; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(val)) { + unsigned long pfn = pmd_pfn(val); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + + walk->mm_stats[MM_LEAF_TOTAL]++; + + if (!pmd_young(val)) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + continue; + } +#endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + + if (arch_has_hw_nonleaf_pmd_young() && + get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (!pmd_young(val)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + } + + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; + + walk->mm_stats[MM_NONLEAF_FOUND]++; + + if (!walk_pte_range(&val, addr, next, args)) + continue; + + walk->mm_stats[MM_NONLEAF_ADDED]++; + + /* carry over to the next generation */ + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + } + + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pud_t *pud; + unsigned long addr; + unsigned long next; + struct lru_gen_mm_walk *walk = args->private; + + VM_WARN_ON_ONCE(p4d_leaf(*p4d)); + + pud = pud_offset(p4d, start & P4D_MASK); +restart: + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { + pud_t val = READ_ONCE(pud[i]); + + next = pud_addr_end(addr, end); + + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) + continue; + + walk_pmd_range(&val, addr, next, args); + + /* a racy check to curtail the waiting time */ + if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) + return 1; + + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + end = (addr | ~PUD_MASK) + 1; + goto done; + } + } + + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) + goto restart; + + end = round_up(end, P4D_SIZE); +done: + if (!end || !args->vma) + return 1; + + walk->next_addr = max(end, args->vma->vm_start); + + return -EAGAIN; +} + +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + static const struct mm_walk_ops mm_walk_ops = { + .test_walk = should_skip_vma, + .p4d_entry = walk_pud_range, + }; + + int err; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + walk->next_addr = FIRST_USER_ADDRESS; + + do { + err = -EBUSY; + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + break; + + /* the caller might be holding the lock for write */ + if (mmap_read_trylock(mm)) { + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); + + mmap_read_unlock(mm); + } + + mem_cgroup_unlock_pages(); + + if (walk->batched) { + spin_lock_irq(&lruvec->lru_lock); + reset_batch_size(lruvec, walk); + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while (err == -EAGAIN); +} + +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + if (pgdat && current_is_kswapd()) { + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; + } else if (!pgdat && !walk) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + } + + current->reclaim_state->mm_walk = walk; + + return walk; +} + +static void clear_mm_walk(void) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); + + current->reclaim_state->mm_walk = NULL; + + if (!current_is_kswapd()) + kfree(walk); +} + +static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +{ + int zone; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) + goto done; + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct list_head *head = &lrugen->lists[old_gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + new_gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); + + if (!--remaining) + return false; + } + } +done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +} + +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +{ + int gen, type, zone; + bool success = false; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + /* find the oldest populated generation */ + for (type = !can_swap; type < ANON_AND_FILE; type++) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + goto next; + } + + min_seq[type]++; + } +next: + ; + } + + /* see the comment on lru_gen_struct */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + if (min_seq[type] == lrugen->min_seq[type]) + continue; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} + +static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +{ + int prev, next; + int type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + + VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); + + while (!inc_min_seq(lruvec, type, can_swap)) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + } + + /* + * Update the active/inactive LRU sizes for compatibility. Both sides of + * the current max_seq need to be covered, since max_seq+1 can overlap + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do + * overlap, cold/hot inversion happens. + */ + prev = lru_gen_from_seq(lrugen->max_seq - 1); + next = lru_gen_from_seq(lrugen->max_seq + 1); + + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; + + if (!delta) + continue; + + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + + WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + + spin_unlock_irq(&lruvec->lru_lock); +} + +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, bool force_scan) +{ + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + + /* see the comment in iterate_mm_list() */ + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { + success = false; + goto done; + } + + /* + * If the hardware doesn't automatically set the accessed bit, fallback + * to lru_gen_look_around(), which only clears the accessed bit in a + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ + if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk = set_mm_walk(NULL); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk->lruvec = lruvec; + walk->max_seq = max_seq; + walk->can_swap = can_swap; + walk->force_scan = force_scan; + + do { + success = iterate_mm_list(lruvec, walk, &mm); + if (mm) + walk_mm(lruvec, mm, walk); + + cond_resched(); + } while (mm); +done: + if (!success) { + if (sc->priority <= DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_state.wait, + max_seq < READ_ONCE(lrugen->max_seq)); + + return max_seq < READ_ONCE(lrugen->max_seq); + } + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); + + inc_max_seq(lruvec, can_swap, force_scan); + /* either this sees any waiters or they will see updated max_seq */ + if (wq_has_sleeper(&lruvec->mm_state.wait)) + wake_up_all(&lruvec->mm_state.wait); + + return true; +} + +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) + return true; + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. + */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +{ + bool need_aging; + unsigned long nr_to_scan; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(NULL, memcg)) + return false; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + + if (min_ttl) { + int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + if (time_is_after_jiffies(birth + min_ttl)) + return false; + + /* the size is likely too small to be helpful */ + if (!nr_to_scan && sc->priority != DEF_PRIORITY) + return false; + } + + if (need_aging) + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + + return true; +} + +/* to protect the working set of the last N jiffies */ +static unsigned long lru_gen_min_ttl __read_mostly; + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + sc->last_reclaimed = sc->nr_reclaimed; + + /* + * To reduce the chance of going into the aging path, which can be + * costly, optimistically skip it if the flag below was cleared in the + * eviction path. This improves the overall performance when multiple + * memcgs are available. + */ + if (!sc->memcgs_need_aging) { + sc->memcgs_need_aging = true; + return; + } + + set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + if (age_lruvec(lruvec, sc, min_ttl)) + success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + clear_mm_walk(); + + /* check the order to exclude compaction-induced reclaim */ + if (success || !min_ttl || sc->order) + return; + + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are + * either below min or empty. + */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; + + out_of_memory(&oc); + + mutex_unlock(&oom_lock); + } +} + +/* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If + * the scan was done cacheline efficiently, it adds the PMD entry pointing to + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. + */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long addr; + struct lru_gen_mm_walk *walk; + int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + + if (spin_is_contended(pvmw->ptl)) + return; + + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) + continue; + + folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); + if (!folio) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_lru_gen(folio); + if (old_gen < 0) + folio_set_referenced(folio); + else if (old_gen != new_gen) + __set_bit(i, bitmap); + } + + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); + + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + folio_activate(folio); + } + return; + } + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; + + if (!walk) { + spin_lock_irq(&lruvec->lru_lock); + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + } + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + if (folio_memcg_rcu(folio) != memcg) + continue; + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen < 0 || old_gen == new_gen) + continue; + + if (walk) + update_batch_size(walk, folio, old_gen, new_gen); + else + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + } + + if (!walk) + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); +} + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) +{ + bool success; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); + + /* unevictable */ + if (!folio_evictable(folio)) { + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* dirty lazyfree */ + if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_swapbacked(folio); + lruvec_add_folio_tail(lruvec, folio); + return true; + } + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + return true; + } + + /* waiting for writeback */ + if (folio_test_locked(folio) || folio_test_writeback(folio) || + (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + gen = folio_inc_gen(lruvec, folio, true); + list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + return false; +} + +static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) +{ + bool success; + + /* unmapping inhibited */ + if (!sc->may_unmap && folio_mapped(folio)) + return false; + + /* swapping inhibited */ + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (folio_test_dirty(folio) || + (folio_test_anon(folio) && !folio_test_swapcache(folio)))) + return false; + + /* raced with release_pages() */ + if (!folio_try_get(folio)) + return false; + + /* raced with another isolation */ + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return false; + } + + /* see the comment on MAX_NR_TIERS */ + if (!folio_test_referenced(folio)) + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + + /* for shrink_folio_list() */ + folio_clear_reclaim(folio); + folio_clear_referenced(folio); + + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + + return true; +} + +static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int gen, zone; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + scanned += delta; + + if (sort_folio(lruvec, folio, tier)) + sorted += delta; + else if (isolate_folio(lruvec, folio, sc)) { + list_add(&folio->lru, list); + isolated += delta; + } else { + list_move(&folio->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + break; + } + + if (skipped) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_LRU_BATCH) + break; + } + + item = PGSCAN_KSWAPD + reclaimer_offset(); + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* + * There might not be eligible pages due to reclaim_idx, may_unmap and + * may_writepage. Check the remaining to prevent livelock if it's not + * making progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * To leave a margin for fluctuations, use a larger gain factor (1:2). + * This value is chosen because any other tier would have at least twice + * as many refaults as the first tier. + */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare the first tier of anon with that of file to determine which + * type to scan. Also need to compare other tiers of the selected type + * with the first tier of the other type to determine the last tier (of + * the selected type) to evict. + */ + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + /* + * Try to make the obvious choice first. When anon and file are both + * available from the same generation, interpret swappiness 1 as file + * first and 200 as anon first. + */ + if (!swappiness) + type = LRU_GEN_FILE; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) + type = LRU_GEN_FILE; + else if (swappiness == 200) + type = LRU_GEN_ANON; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_folios(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + bool *need_swapping) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + LIST_HEAD(clean); + struct folio *folio; + struct folio *next; + enum vm_event_item item; + struct reclaim_stat stat; + struct lru_gen_mm_walk *walk; + bool skip_retry = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + + scanned += try_to_inc_min_seq(lruvec, swappiness); + + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; +retry: + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + sc->nr_reclaimed += reclaimed; + + list_for_each_entry_safe_reverse(folio, next, &list, lru) { + if (!folio_evictable(folio)) { + list_del(&folio->lru); + folio_putback_lru(folio); + continue; + } + + if (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio))) { + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ + if (folio_test_workingset(folio)) + folio_set_referenced(folio); + continue; + } + + if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || + folio_mapped(folio) || folio_test_locked(folio) || + folio_test_dirty(folio) || folio_test_writeback(folio)) { + /* don't add rejected folios to the oldest generation */ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, + BIT(PG_active)); + continue; + } + + /* retry folios that may have missed folio_rotate_reclaimable() */ + list_move(&folio->lru, &clean); + sc->nr_scanned -= folio_nr_pages(folio); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_folios_to_lru(lruvec, &list); + + walk = current->reclaim_state->mm_walk; + if (walk && walk->batched) + reset_batch_size(lruvec, walk); + + item = PGSTEAL_KSWAPD + reclaimer_offset(); + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + INIT_LIST_HEAD(&list); + list_splice_init(&clean, &list); + + if (!list_empty(&list)) { + skip_retry = true; + goto retry; + } + + if (need_swapping && type == LRU_GEN_ANON) + *need_swapping = true; + + return scanned; +} + +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. + */ +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + bool can_swap, bool *need_aging) +{ + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || + (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && + !sc->memcg_low_reclaim)) + return 0; + + *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); + if (!*need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) + goto done; + + /* leave the work to lru_gen_age_node() */ + if (current_is_kswapd()) + return 0; + + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) + return nr_to_scan; +done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; +} + +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, + struct scan_control *sc, bool need_swapping) +{ + int i; + DEFINE_MAX_SEQ(lruvec); + + if (!current_is_kswapd()) { + /* age each memcg at most once to ensure fairness */ + if (max_seq - seq > 1) + return true; + + /* over-swapping can increase allocation latency */ + if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) + return true; + + /* give this thread a chance to exit and free its memory */ + if (fatal_signal_pending(current)) { + sc->nr_reclaimed += MIN_LRU_BATCH; + return true; + } + + if (cgroup_reclaim(sc)) + return false; + } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) + return false; + + /* keep scanning at low priorities to ensure fairness */ + if (sc->priority > DEF_PRIORITY - 2) + return false; + + /* + * A minimum amount of work was done under global memory pressure. For + * kswapd, it may be overshooting. For direct reclaim, the allocation + * may succeed if all suitable zones are somewhat safe. In either case, + * it's better to stop now, and restart later if necessary. + */ + for (i = 0; i <= sc->reclaim_idx; i++) { + unsigned long wmark; + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + + if (!managed_zone(zone)) + continue; + + wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); + if (wmark > zone_page_state(zone, NR_FREE_PAGES)) + return false; + } + + sc->nr_reclaimed += MIN_LRU_BATCH; + + return true; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + bool need_aging = false; + bool need_swapping = false; + unsigned long scanned = 0; + unsigned long reclaimed = sc->nr_reclaimed; + DEFINE_MAX_SEQ(lruvec); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + while (true) { + int delta; + int swappiness; + unsigned long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(lruvec, sc); + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + if (!nr_to_scan) + goto done; + + delta = evict_folios(lruvec, sc, swappiness, &need_swapping); + if (!delta) + goto done; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) + break; + + cond_resched(); + } + + /* see the comment in lru_gen_age_node() */ + if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) + sc->memcgs_need_aging = false; +done: + clear_mm_walk(); + + blk_finish_plug(&plug); +} + +/****************************************************************************** + * state change + ******************************************************************************/ + +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; + + for_each_evictable_lru(lru) { + if (!list_empty(&lruvec->lists[lru])) + return false; + } + } else { + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + return false; + } + } + + return true; +} + +static bool fill_evictable(struct lruvec *lruvec) +{ + enum lru_list lru; + int remaining = MAX_LRU_BATCH; + + for_each_evictable_lru(lru) { + int type = is_file_lru(lru); + bool active = is_active_lru(lru); + struct list_head *head = &lruvec->lists[lru]; + + while (!list_empty(head)) { + bool success; + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); + + lruvec_del_folio(lruvec, folio); + success = lru_gen_add_folio(lruvec, folio, false); + VM_WARN_ON_ONCE(!success); + + if (!--remaining) + return false; + } + } + + return true; +} + +static bool drain_evictable(struct lruvec *lruvec) +{ + int gen, type, zone; + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { + struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; + + while (!list_empty(head)) { + bool success; + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + success = lru_gen_del_folio(lruvec, folio, false); + VM_WARN_ON_ONCE(!success); + lruvec_add_folio(lruvec, folio); + + if (!--remaining) + return false; + } + } + + return true; +} + +static void lru_gen_change_state(bool enabled) +{ + static DEFINE_MUTEX(state_mutex); + + struct mem_cgroup *memcg; + + cgroup_lock(); + cpus_read_lock(); + get_online_mems(); + mutex_lock(&state_mutex); + + if (enabled == lru_gen_enabled()) + goto unlock; + + if (enabled) + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + else + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + VM_WARN_ON_ONCE(!state_is_valid(lruvec)); + + lruvec->lrugen.enabled = enabled; + + while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +unlock: + mutex_unlock(&state_mutex); + put_online_mems(); + cpus_read_unlock(); + cgroup_unlock(); +} + +/****************************************************************************** + * sysfs interface + ******************************************************************************/ + +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned int msecs; + + if (kstrtouint(buf, 0, &msecs)) + return -EINVAL; + + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); + + return len; +} + +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( + min_ttl_ms, 0644, show_min_ttl, store_min_ttl +); + +static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned int caps = 0; + + if (get_cap(LRU_GEN_CORE)) + caps |= BIT(LRU_GEN_CORE); + + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + caps |= BIT(LRU_GEN_MM_WALK); + + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + + return sysfs_emit(buf, "0x%04x\n", caps); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int i; + unsigned int caps; + + if (tolower(*buf) == 'n') + caps = 0; + else if (tolower(*buf) == 'y') + caps = -1; + else if (kstrtouint(buf, 0, &caps)) + return -EINVAL; + + for (i = 0; i < NR_LRU_GEN_CAPS; i++) { + bool enabled = caps & BIT(i); + + if (i == LRU_GEN_CORE) + lru_gen_change_state(enabled); + else if (enabled) + static_branch_enable(&lru_gen_caps[i]); + else + static_branch_disable(&lru_gen_caps[i]); + } + + return len; +} + +static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + enabled, 0644, show_enabled, store_enabled +); + +static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL +}; + +static struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, +}; + +/****************************************************************************** + * debugfs interface + ******************************************************************************/ + +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg; + loff_t nr_to_skip = *pos; + + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); + if (!m->private) + return ERR_PTR(-ENOMEM); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!nr_to_skip--) + return get_lruvec(memcg, nid); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + return NULL; +} + +static void lru_gen_seq_stop(struct seq_file *m, void *v) +{ + if (!IS_ERR_OR_NULL(v)) + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); + + kvfree(m->private); + m->private = NULL; +} + +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int nid = lruvec_pgdat(v)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(v); + + ++*pos; + + nid = next_memory_node(nid); + if (nid == MAX_NUMNODES) { + memcg = mem_cgroup_iter(NULL, memcg, NULL); + if (!memcg) + return NULL; + + nid = first_memory_node; + } + + return get_lruvec(memcg, nid); +} + +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + unsigned long max_seq, unsigned long *min_seq, + unsigned long seq) +{ + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); + for (type = 0; type < ANON_AND_FILE; type++) { + const char *s = " "; + unsigned long n[3] = {}; + + if (seq == max_seq) { + s = "RT "; + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { + s = "rep"; + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + } + + for (i = 0; i < 3; i++) + seq_printf(m, " %10lu%c", n[i], s[i]); + } + seq_putc(m, '\n'); + } + + seq_puts(m, " "); + for (i = 0; i < NR_MM_STATS; i++) { + const char *s = " "; + unsigned long n = 0; + + if (seq == max_seq && NR_HIST_GENS == 1) { + s = "LOYNFA"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } else if (seq != max_seq && NR_HIST_GENS > 1) { + s = "loynfa"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } + + seq_printf(m, " %10lu%c", n, s[i]); + } + seq_putc(m, '\n'); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static int lru_gen_seq_show(struct seq_file *m, void *v) +{ + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (nid == first_memory_node) { + const char *path = memcg ? m->private : ""; + +#ifdef CONFIG_MEMCG + if (memcg) + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); +#endif + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + } + + seq_printf(m, " node %5d\n", nid); + + if (!full) + seq = min_seq[LRU_GEN_ANON]; + else if (max_seq >= MAX_NR_GENS) + seq = max_seq - MAX_NR_GENS + 1; + else + seq = 0; + + for (; seq <= max_seq; seq++) { + int type, zone; + int gen = lru_gen_from_seq(seq); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); + + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long size = 0; + char mark = full && seq < min_seq[type] ? 'x' : ' '; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + seq_printf(m, " %10lu%c", size, mark); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } + + return 0; +} + +static const struct seq_operations lru_gen_seq_ops = { + .start = lru_gen_seq_start, + .stop = lru_gen_seq_stop, + .next = lru_gen_seq_next, + .show = lru_gen_seq_show, +}; + +static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + bool can_swap, bool force_scan) +{ + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (seq < max_seq) + return 0; + + if (seq > max_seq) + return -EINVAL; + + if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) + return -ERANGE; + + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + + return 0; +} + +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + int swappiness, unsigned long nr_to_reclaim) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq + MIN_NR_GENS > max_seq) + return -EINVAL; + + sc->nr_reclaimed = 0; + + while (!signal_pending(current)) { + DEFINE_MIN_SEQ(lruvec); + + if (seq < min_seq[!swappiness]) + return 0; + + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + + if (!evict_folios(lruvec, sc, swappiness, NULL)) + return 0; + + cond_resched(); + } + + return -EINTR; +} + +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + struct scan_control *sc, int swappiness, unsigned long opt) +{ + struct lruvec *lruvec; + int err = -EINVAL; + struct mem_cgroup *memcg = NULL; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) + return -EINVAL; + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); +#ifdef CONFIG_MEMCG + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; +#endif + rcu_read_unlock(); + + if (!memcg) + return -EINVAL; + } + + if (memcg_id != mem_cgroup_id(memcg)) + goto done; + + lruvec = get_lruvec(memcg, nid); + + if (swappiness < 0) + swappiness = get_swappiness(lruvec, sc); + else if (swappiness > 200) + goto done; + + switch (cmd) { + case '+': + err = run_aging(lruvec, seq, sc, swappiness, opt); + break; + case '-': + err = run_eviction(lruvec, seq, sc, swappiness, opt); + break; + } +done: + mem_cgroup_put(memcg); + + return err; +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + size_t len, loff_t *pos) +{ + void *buf; + char *cur, *next; + unsigned int flags; + struct blk_plug plug; + int err = -EINVAL; + struct scan_control sc = { + .may_writepage = true, + .may_unmap = true, + .may_swap = true, + .reclaim_idx = MAX_NR_ZONES - 1, + .gfp_mask = GFP_KERNEL, + }; + + buf = kvmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, src, len)) { + kvfree(buf); + return -EFAULT; + } + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); + if (!set_mm_walk(NULL)) { + err = -ENOMEM; + goto done; + } + + next = buf; + next[len] = '\0'; + + while ((cur = strsep(&next, ",;\n"))) { + int n; + int end; + char cmd; + unsigned int memcg_id; + unsigned int nid; + unsigned long seq; + unsigned int swappiness = -1; + unsigned long opt = -1; + + cur = skip_spaces(cur); + if (!*cur) + continue; + + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, &swappiness, &end, &opt, &end); + if (n < 4 || cur[end]) { + err = -EINVAL; + break; + } + + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); + if (err) + break; + } +done: + clear_mm_walk(); + blk_finish_plug(&plug); + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + kvfree(buf); + + return err ? : len; +} + +static int lru_gen_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lru_gen_seq_ops); +} + +static const struct file_operations lru_gen_rw_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .write = lru_gen_seq_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations lru_gen_ro_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_lruvec(struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + init_waitqueue_head(&lruvec->mm_state.wait); +} + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + INIT_LIST_HEAD(&memcg->mm_list.fifo); + spin_lock_init(&memcg->mm_list.lock); +} + +void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ + int i; + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } + } +} +#endif + +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + + return 0; +}; +late_initcall(init_lru_gen); + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + +#endif /* CONFIG_LRU_GEN */ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; @@ -2955,8 +5910,13 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + bool proportional_reclaim; struct blk_plug plug; - bool scan_adjusted; + + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } get_scan_count(lruvec, sc, nr); @@ -2974,8 +5934,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ - scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && - sc->priority == DEF_PRIORITY); + proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -2995,7 +5955,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) cond_resched(); - if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* @@ -3046,8 +6006,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); - - scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; @@ -3154,13 +6112,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) mem_cgroup_calculate_protection(target_memcg, memcg); - if (mem_cgroup_below_min(memcg)) { + if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; - } else if (mem_cgroup_below_low(memcg)) { + } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as @@ -3197,109 +6155,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long nr_reclaimed, nr_scanned; struct lruvec *target_lruvec; bool reclaimable = false; - unsigned long file; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: - /* - * Flush the memory cgroup stats, so that we read accurate per-memcg - * lruvec stats for heuristics. - */ - mem_cgroup_flush_stats(); - memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - /* - * Determine the scan balance between anon and file LRUs. - */ - spin_lock_irq(&target_lruvec->lru_lock); - sc->anon_cost = target_lruvec->anon_cost; - sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&target_lruvec->lru_lock); - - /* - * Target desirable inactive:active list ratios for the anon - * and file LRU lists. - */ - if (!sc->force_deactivate) { - unsigned long refaults; - - refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE_ANON); - if (refaults != target_lruvec->refaults[0] || - inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) - sc->may_deactivate |= DEACTIVATE_ANON; - else - sc->may_deactivate &= ~DEACTIVATE_ANON; - - /* - * When refaults are being observed, it means a new - * workingset is being established. Deactivate to get - * rid of any stale active pages quickly. - */ - refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE_FILE); - if (refaults != target_lruvec->refaults[1] || - inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) - sc->may_deactivate |= DEACTIVATE_FILE; - else - sc->may_deactivate &= ~DEACTIVATE_FILE; - } else - sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; - - /* - * If we have plenty of inactive file pages that aren't - * thrashing, try to reclaim those first before touching - * anonymous pages. - */ - file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) - sc->cache_trim_mode = 1; - else - sc->cache_trim_mode = 0; - - /* - * Prevent the reclaimer from falling into the cache trap: as - * cache pages start out inactive, every cache fault will tip - * the scan balance towards the file LRU. And as the file LRU - * shrinks, so does the window for rotation from references. - * This means we have a runaway feedback loop where a tiny - * thrashing file LRU becomes infinitely more attractive than - * anon pages. Try to detect this based on file LRU size. - */ - if (!cgroup_reclaim(sc)) { - unsigned long total_high_wmark = 0; - unsigned long free, anon; - int z; - - free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); - file = node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE); - - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - - total_high_wmark += high_wmark_pages(zone); - } - - /* - * Consider anon: if that's low too, this isn't a - * runaway file reclaim problem, but rather just - * extreme pressure. Reclaim as per usual then. - */ - anon = node_page_state(pgdat, NR_INACTIVE_ANON); - - sc->file_is_tiny = - file + free <= total_high_wmark && - !(sc->may_deactivate & DEACTIVATE_ANON) && - anon >> sc->priority; - } + prepare_scan_count(pgdat, sc); shrink_node_memcgs(pgdat, sc); @@ -3557,11 +6422,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; + target_lruvec->refaults[WORKINGSET_ANON] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); - target_lruvec->refaults[1] = refaults; + target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* @@ -3886,7 +6754,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + nodemask_t *nodemask) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -3901,6 +6770,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put @@ -3923,12 +6793,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct pglist_data *pgdat, - struct scan_control *sc) +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); + return; + } + if (!can_age_anon_pages(pgdat, sc)) return; @@ -4248,12 +7122,11 @@ restart: sc.may_swap = !nr_boost_reclaim; /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. All - * pages are rotated regardless of classzone as this is - * about consistent aging. + * Do some background aging, to give pages a chance to be + * referenced before reclaiming. All pages are rotated + * regardless of classzone as this is about consistent aging. */ - age_active_anon(pgdat, &sc); + kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage @@ -4643,16 +7516,17 @@ void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - if (pgdat->kswapd) - return; - - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ - BUG_ON(system_state < SYSTEM_RUNNING); - pr_err("Failed to start kswapd on node %d\n", nid); - pgdat->kswapd = NULL; + pgdat_kswapd_lock(pgdat); + if (!pgdat->kswapd) { + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd on node %d\n", nid); + pgdat->kswapd = NULL; + } } + pgdat_kswapd_unlock(pgdat); } /* @@ -4661,12 +7535,16 @@ void kswapd_run(int nid) */ void kswapd_stop(int nid) { - struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + pg_data_t *pgdat = NODE_DATA(nid); + struct task_struct *kswapd; + pgdat_kswapd_lock(pgdat); + kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; + pgdat->kswapd = NULL; } + pgdat_kswapd_unlock(pgdat); } static int __init kswapd_init(void) diff --git a/mm/vmstat.c b/mm/vmstat.c index 90af9a8572f5..1ea6a5ce1c41 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -28,7 +28,6 @@ #include <linux/mm_inline.h> #include <linux/page_ext.h> #include <linux/page_owner.h> -#include <linux/migrate.h> #include "internal.h" @@ -355,8 +354,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -368,8 +366,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -393,8 +390,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -406,8 +402,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -441,8 +436,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -453,8 +447,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -466,8 +459,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -478,8 +470,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -501,8 +492,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -513,8 +503,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -526,8 +515,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -538,8 +526,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) @@ -1247,11 +1234,13 @@ const char * const vmstat_text[] = { "nr_shadow_call_stack", #endif "nr_page_table_pages", + "nr_sec_page_table_pages", #ifdef CONFIG_SWAP "nr_swapcached", #endif #ifdef CONFIG_NUMA_BALANCING "pgpromote_success", + "pgpromote_candidate", #endif /* enum writeback_stat_item counters */ @@ -1282,10 +1271,13 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgsteal_khugepaged", "pgdemote_kswapd", "pgdemote_direct", + "pgdemote_khugepaged", "pgscan_kswapd", "pgscan_direct", + "pgscan_khugepaged", "pgscan_direct_throttle", "pgscan_anon", "pgscan_file", @@ -1389,10 +1381,6 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - "vmacache_find_calls", - "vmacache_find_hits", -#endif #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", @@ -2067,7 +2055,6 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); - set_migration_target_nodes(); } return 0; @@ -2092,7 +2079,6 @@ static int vmstat_cpu_dead(unsigned int cpu) return 0; node_clear_state(node, N_CPU); - set_migration_target_nodes(); return 0; } @@ -2125,7 +2111,6 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif - migrate_on_reclaim_init(); #ifdef CONFIG_PROC_FS proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); diff --git a/mm/workingset.c b/mm/workingset.c index a5e84862fc86..1a86645b7b3c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { - eviction >>= bucket_order; eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; @@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); - *evictionp = entry << bucket_order; + *evictionp = entry; *workingsetp = workingset; } +#ifdef CONFIG_LRU_GEN + +static void *lru_gen_eviction(struct folio *folio) +{ + int hist; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lru_gen_struct *lrugen; + int type = folio_is_file_lru(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); + + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->lrugen; + min_seq = READ_ONCE(lrugen->min_seq[type]); + token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); + + hist = lru_hist_from_seq(min_seq); + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); + + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); +} + +static void lru_gen_refault(struct folio *folio, void *shadow) +{ + int hist, tier, refs; + int memcg_id; + bool workingset; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lru_gen_struct *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = folio_is_file_lru(folio); + int delta = folio_nr_pages(folio); + + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); + + if (pgdat != folio_pgdat(folio)) + return; + + rcu_read_lock(); + + memcg = folio_memcg_rcu(folio); + if (memcg_id != mem_cgroup_id(memcg)) + goto unlock; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->lrugen; + + min_seq = READ_ONCE(lrugen->min_seq[type]); + if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) + goto unlock; + + hist = lru_hist_from_seq(min_seq); + /* see the comment in folio_lru_refs() */ + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; + tier = lru_tier_from_refs(refs); + + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); + + /* + * Count the following two cases as stalls: + * 1. For pages accessed through page tables, hotter pages pushed out + * hot pages which refaulted immediately. + * 2. For pages accessed multiple times through file descriptors, + * numbers of accesses might have been out of the range. + */ + if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { + folio_set_workingset(folio); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); + } +unlock: + rcu_read_unlock(); +} + +#else /* !CONFIG_LRU_GEN */ + +static void *lru_gen_eviction(struct folio *folio) +{ + return NULL; +} + +static void lru_gen_refault(struct folio *folio, void *shadow) +{ +} + +#endif /* CONFIG_LRU_GEN */ + /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged @@ -264,10 +360,14 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (lru_gen_enabled()) + return lru_gen_eviction(folio); + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); + eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); @@ -298,7 +398,13 @@ void workingset_refault(struct folio *folio, void *shadow) int memcgid; long nr; + if (lru_gen_enabled()) { + lru_gen_refault(folio, shadow); + return; + } + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + eviction <<= bucket_order; rcu_read_lock(); /* @@ -386,8 +492,11 @@ void workingset_refault(struct folio *folio, void *shadow) /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); - /* XXX: Move to lru_cache_add() when it supports new vs putback */ - lru_note_cost_folio(folio); + /* + * XXX: Move to folio_add_lru() when it supports new vs + * putback + */ + lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: diff --git a/mm/z3fold.c b/mm/z3fold.c index cf71da10d04e..a4de0c317ac7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -68,9 +68,6 @@ * Structures *****************/ struct z3fold_pool; -struct z3fold_ops { - int (*evict)(struct z3fold_pool *pool, unsigned long handle); -}; enum buddy { HEADLESS = 0, @@ -138,8 +135,6 @@ struct z3fold_header { * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation - * @ops: pointer to a structure of user defined operations specified at - * pool creation time. * @zpool: zpool driver * @zpool_ops: zpool operations structure with an evict callback * @compact_wq: workqueue for page layout background optimization @@ -158,7 +153,6 @@ struct z3fold_pool { struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; - const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; @@ -907,13 +901,11 @@ out_fail: * z3fold_create_pool() - create a new z3fold pool * @name: pool name * @gfp: gfp flags when allocating the z3fold pool structure - * @ops: user-defined operations for the z3fold pool * * Return: pointer to the new z3fold pool or NULL if the metadata allocation * failed. */ -static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, - const struct z3fold_ops *ops) +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp) { struct z3fold_pool *pool = NULL; int i, cpu; @@ -949,7 +941,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, if (!pool->release_wq) goto out_wq; INIT_WORK(&pool->work, free_pages_work); - pool->ops = ops; return pool; out_wq: @@ -1230,10 +1221,6 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || retries == 0) { - spin_unlock(&pool->lock); - return -EINVAL; - } for (i = 0; i < retries; i++) { if (list_empty(&pool->lru)) { spin_unlock(&pool->lock); @@ -1319,17 +1306,17 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) } /* Issue the eviction callback(s) */ if (middle_handle) { - ret = pool->ops->evict(pool, middle_handle); + ret = pool->zpool_ops->evict(pool->zpool, middle_handle); if (ret) goto next; } if (first_handle) { - ret = pool->ops->evict(pool, first_handle); + ret = pool->zpool_ops->evict(pool->zpool, first_handle); if (ret) goto next; } if (last_handle) { - ret = pool->ops->evict(pool, last_handle); + ret = pool->zpool_ops->evict(pool->zpool, last_handle); if (ret) goto next; } @@ -1593,26 +1580,13 @@ static const struct movable_operations z3fold_mops = { * zpool ****************/ -static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct z3fold_ops z3fold_zpool_ops = { - .evict = z3fold_zpool_evict -}; - static void *z3fold_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct z3fold_pool *pool; - pool = z3fold_create_pool(name, gfp, - zpool_ops ? &z3fold_zpool_ops : NULL); + pool = z3fold_create_pool(name, gfp); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zbud.c b/mm/zbud.c index 6348932430b8..3acd26193920 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -74,10 +74,6 @@ struct zbud_pool; -struct zbud_ops { - int (*evict)(struct zbud_pool *pool, unsigned long handle); -}; - /** * struct zbud_pool - stores metadata for each zbud pool * @lock: protects all pool fields and first|last_chunk fields of any @@ -90,8 +86,6 @@ struct zbud_ops { * @lru: list tracking the zbud pages in LRU order by most recently * added buddy. * @pages_nr: number of zbud pages in the pool. - * @ops: pointer to a structure of user defined operations specified at - * pool creation time. * @zpool: zpool driver * @zpool_ops: zpool operations structure with an evict callback * @@ -110,7 +104,6 @@ struct zbud_pool { }; struct list_head lru; u64 pages_nr; - const struct zbud_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; }; @@ -212,12 +205,11 @@ static int num_free_chunks(struct zbud_header *zhdr) /** * zbud_create_pool() - create a new zbud pool * @gfp: gfp flags when allocating the zbud pool structure - * @ops: user-defined operations for the zbud pool * * Return: pointer to the new zbud pool or NULL if the metadata allocation * failed. */ -static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +static struct zbud_pool *zbud_create_pool(gfp_t gfp) { struct zbud_pool *pool; int i; @@ -231,7 +223,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) INIT_LIST_HEAD(&pool->buddied); INIT_LIST_HEAD(&pool->lru); pool->pages_nr = 0; - pool->ops = ops; return pool; } @@ -419,8 +410,7 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) unsigned long first_handle = 0, last_handle = 0; spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || - retries == 0) { + if (list_empty(&pool->lru)) { spin_unlock(&pool->lock); return -EINVAL; } @@ -444,12 +434,12 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) /* Issue the eviction callback(s) */ if (first_handle) { - ret = pool->ops->evict(pool, first_handle); + ret = pool->zpool_ops->evict(pool->zpool, first_handle); if (ret) goto next; } if (last_handle) { - ret = pool->ops->evict(pool, last_handle); + ret = pool->zpool_ops->evict(pool->zpool, last_handle); if (ret) goto next; } @@ -524,25 +514,13 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool) * zpool ****************/ -static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct zbud_ops zbud_zpool_ops = { - .evict = zbud_zpool_evict -}; - static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct zbud_pool *pool; - pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + pool = zbud_create_pool(gfp); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zpool.c b/mm/zpool.c index 68facc193496..571f5c5031dd 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,9 +21,6 @@ struct zpool { struct zpool_driver *driver; void *pool; - const struct zpool_ops *ops; - bool evictable; - bool can_sleep_mapped; }; static LIST_HEAD(drivers_head); @@ -177,9 +174,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); - zpool->ops = ops; - zpool->evictable = driver->shrink && ops && ops->evict; - zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -380,18 +374,25 @@ u64 zpool_get_total_size(struct zpool *zpool) */ bool zpool_evictable(struct zpool *zpool) { - return zpool->evictable; + return zpool->driver->shrink; } /** * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. * @zpool: The zpool to test * + * Some allocators enter non-preemptible context in ->map() callback (e.g. + * disable pagefaults) and exit that context in ->unmap(), which limits what + * we can do with the mapped object. For instance, we cannot wait for + * asynchronous crypto API to decompress such an object or take mutexes + * since those will call into the scheduler. This function tells us whether + * we use such an allocator. + * * Returns: true if zpool can sleep; false otherwise. */ bool zpool_can_sleep_mapped(struct zpool *zpool) { - return zpool->can_sleep_mapped; + return zpool->driver->sleep_mapped; } MODULE_LICENSE("GPL"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 907c9b1e1e61..9445bee6b014 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -33,8 +33,7 @@ /* * lock ordering: * page_lock - * pool->migrate_lock - * class->lock + * pool->lock * zspage->lock */ @@ -192,7 +191,6 @@ static const int fullness_threshold_frac = 4; static size_t huge_class_size; struct size_class { - spinlock_t lock; struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple @@ -241,14 +239,20 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; +#ifdef CONFIG_ZPOOL + /* List tracking the zspages in LRU order by most recently added object */ + struct list_head lru; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +#endif + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif - /* protect page/zspage migration */ - rwlock_t migrate_lock; + spinlock_t lock; }; struct zspage { @@ -263,10 +267,17 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + +#ifdef CONFIG_ZPOOL + /* links the zspage to the lru list in the pool */ + struct list_head lru; + bool under_reclaim; + /* list of unfreed handles whose objects have been reclaimed */ + unsigned long *deferred_handles; +#endif + struct zs_pool *pool; -#ifdef CONFIG_COMPACTION rwlock_t lock; -#endif }; struct mapping_area { @@ -287,10 +298,11 @@ static bool ZsHugePage(struct zspage *zspage) return zspage->huge; } -#ifdef CONFIG_COMPACTION static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); @@ -298,9 +310,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_lock_init(struct zspage *zspage) {} -static void migrate_read_lock(struct zspage *zspage) {} -static void migrate_read_unlock(struct zspage *zspage) {} static void migrate_write_lock(struct zspage *zspage) {} static void migrate_write_lock_nested(struct zspage *zspage) {} static void migrate_write_unlock(struct zspage *zspage) {} @@ -355,7 +364,7 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } -/* class->lock(which owns the handle) synchronizes races */ +/* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { *(unsigned long *)handle = obj; @@ -374,7 +383,14 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, * different contexts and its caller must provide a valid * gfp mask. */ - return zs_create_pool(name); + struct zs_pool *pool = zs_create_pool(name); + + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + + return pool; } static void zs_zpool_destroy(void *pool) @@ -387,7 +403,7 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, { *handle = zs_malloc(pool, size, gfp); - if (IS_ERR((void *)(*handle))) + if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); return 0; } @@ -396,6 +412,27 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries); + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zs_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -434,6 +471,7 @@ static struct zpool_driver zs_zpool_driver = { .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, + .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, @@ -452,7 +490,7 @@ static __maybe_unused int is_first_page(struct page *page) return PagePrivate(page); } -/* Protected by class->lock */ +/* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; @@ -472,12 +510,12 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -static inline int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct page *page) { return page->page_type; } -static inline void set_first_obj_offset(struct page *page, int offset) +static inline void set_first_obj_offset(struct page *page, unsigned int offset) { page->page_type = offset; } @@ -597,13 +635,13 @@ static int zs_stats_size_show(struct seq_file *s, void *v) if (class->index != i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); freeable = zs_can_compact(class); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * @@ -907,6 +945,25 @@ unlock: return 0; } +#ifdef CONFIG_ZPOOL +/* + * Free all the deferred handles whose objects are freed in zs_free. + */ +static void free_handles(struct zs_pool *pool, struct zspage *zspage) +{ + unsigned long handle = (unsigned long)zspage->deferred_handles; + + while (handle) { + unsigned long nxt_handle = handle_to_obj(handle); + + cache_free_handle(pool, handle); + handle = nxt_handle; + } +} +#else +static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +#endif + static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { @@ -916,11 +973,14 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, get_zspage_mapping(zspage, &class_idx, &fg); - assert_spin_locked(&class->lock); + assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg != ZS_EMPTY); + /* Free all deferred handles from zs_free */ + free_handles(pool, zspage); + next = page = get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -956,6 +1016,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, } remove_zspage(class, zspage, ZS_EMPTY); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); } @@ -1001,6 +1064,12 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) off %= PAGE_SIZE; } +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&zspage->lru); + zspage->under_reclaim = false; + zspage->deferred_handles = NULL; +#endif + set_freeobj(zspage, 0); } @@ -1205,6 +1274,27 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +/** + * zs_lookup_class_index() - Returns index of the zsmalloc &size_class + * that hold objects of the provided size. + * @pool: zsmalloc pool to use + * @size: object size + * + * Context: Any context. + * + * Return: the index of the zsmalloc &size_class that hold objects of the + * provided size. + */ +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) +{ + struct size_class *class; + + class = pool->size_class[get_size_class_index(size)]; + + return class->index; +} +EXPORT_SYMBOL_GPL(zs_lookup_class_index); + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); @@ -1247,19 +1337,44 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, BUG_ON(in_interrupt()); /* It guarantees it can get zspage from handle safely */ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); +#ifdef CONFIG_ZPOOL + /* + * Move the zspage to front of pool's LRU. + * + * Note that this is swap-specific, so by definition there are no ongoing + * accesses to the memory while the page is swapped out that would make + * it "hot". A new entry is hot, then ages to the tail until it gets either + * written back or swaps back in. + * + * Furthermore, map is also called during writeback. We must not put an + * isolated page on the LRU mid-reclaim. + * + * As a result, only update the LRU when the page is mapped for write + * when it's first instantiated. + * + * This is a deviation from the other backends, which perform this update + * in the allocation function (zbud_alloc, z3fold_alloc). + */ + if (mm == ZS_MM_WO) { + if (!list_empty(&zspage->lru)) + list_del(&zspage->lru); + list_add(&zspage->lru, &pool->lru); + } +#endif + /* - * migration cannot move any zpages in this zspage. Here, class->lock + * migration cannot move any zpages in this zspage. Here, pool->lock * is too heavy since callers would take some time until they calls * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. */ migrate_read_lock(zspage); - read_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; @@ -1412,8 +1527,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; - /* class->lock effectively protects the zpage migration */ - spin_lock(&class->lock); + /* pool->lock effectively protects the zpage migration */ + spin_lock(&pool->lock); zspage = find_get_zspage(class); if (likely(zspage)) { obj = obj_malloc(pool, zspage, handle); @@ -1421,12 +1536,12 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, OBJ_USED, 1); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } - spin_unlock(&class->lock); + spin_unlock(&pool->lock); zspage = alloc_zspage(pool, class, gfp); if (!zspage) { @@ -1434,7 +1549,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) return (unsigned long)ERR_PTR(-ENOMEM); } - spin_lock(&class->lock); + spin_lock(&pool->lock); obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); @@ -1447,7 +1562,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } @@ -1491,26 +1606,38 @@ void zs_free(struct zs_pool *pool, unsigned long handle) return; /* - * The pool->migrate_lock protects the race with zpage's migration + * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. */ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - spin_lock(&class->lock); - read_unlock(&pool->migrate_lock); obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); + +#ifdef CONFIG_ZPOOL + if (zspage->under_reclaim) { + /* + * Reclaim needs the handles during writeback. It'll free + * them along with the zspage when it's done with them. + * + * Record current deferred handle at the memory location + * whose address is given by handle. + */ + record_obj(handle, (unsigned long)zspage->deferred_handles); + zspage->deferred_handles = (unsigned long *)handle; + spin_unlock(&pool->lock); + return; + } +#endif fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) - goto out; + if (fullness == ZS_EMPTY) + free_zspage(pool, class, zspage); - free_zspage(pool, class, zspage); -out: - spin_unlock(&class->lock); + spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1555,6 +1682,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, d_off += size; d_size -= size; + /* + * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() + * calls must occurs in reverse order of calls to kmap_atomic(). + * So, to call kunmap_atomic(s_addr) we should first call + * kunmap_atomic(d_addr). For more details see + * Documentation/mm/highmem.rst. + */ if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); @@ -1585,7 +1719,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - int offset = 0; + unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); @@ -1702,7 +1836,7 @@ static enum fullness_group putback_zspage(struct size_class *class, return fullness; } -#ifdef CONFIG_COMPACTION +#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. @@ -1744,6 +1878,24 @@ static void lock_zspage(struct zspage *zspage) } migrate_read_unlock(zspage); } +#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */ + +#ifdef CONFIG_ZPOOL +/* + * Unlocks all the pages of the zspage. + * + * pool->lock must be held before this function is called + * to prevent the underlying pages from migrating. + */ +static void unlock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); +} +#endif /* CONFIG_ZPOOL */ static void migrate_lock_init(struct zspage *zspage) { @@ -1760,6 +1912,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); @@ -1839,7 +1992,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset; + unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; @@ -1860,16 +2013,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page, pool = zspage->pool; /* - * The pool migrate_lock protects the race between zpage migration + * The pool's lock protects the race between zpage migration * and zs_free. */ - write_lock(&pool->migrate_lock); + spin_lock(&pool->lock); class = zspage_class(pool, zspage); - /* - * the class lock protects zpage alloc/free in the zspage. - */ - spin_lock(&class->lock); /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); @@ -1899,10 +2048,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page, replace_sub_page(class, zspage, newpage, page); /* * Since we complete the data copy and set up new zspage structure, - * it's okay to release migration_lock. + * it's okay to release the pool's lock. */ - write_unlock(&pool->migrate_lock); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); dec_zspage_isolation(zspage); migrate_write_unlock(zspage); @@ -1957,9 +2105,9 @@ static void async_free_zspage(struct work_struct *work) if (class->index != i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } list_for_each_entry_safe(zspage, tmp, &free_pages, list) { @@ -1969,9 +2117,12 @@ static void async_free_zspage(struct work_struct *work) get_zspage_mapping(zspage, &class_idx, &fullness); VM_BUG_ON(fullness != ZS_EMPTY); class = pool->size_class[class_idx]; - spin_lock(&class->lock); + spin_lock(&pool->lock); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } }; @@ -2032,10 +2183,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; - /* protect the race between zpage migration and zs_free */ - write_lock(&pool->migrate_lock); - /* protect zpage allocation/free */ - spin_lock(&class->lock); + /* + * protect the race between zpage migration and zs_free + * as well as zpage allocation/free + */ + spin_lock(&pool->lock); while ((src_zspage = isolate_zspage(class, true))) { /* protect someone accessing the zspage(i.e., zs_map_object) */ migrate_write_lock(src_zspage); @@ -2060,7 +2212,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, putback_zspage(class, dst_zspage); migrate_write_unlock(dst_zspage); dst_zspage = NULL; - if (rwlock_is_contended(&pool->migrate_lock)) + if (spin_is_contended(&pool->lock)) break; } @@ -2077,11 +2229,9 @@ static unsigned long __zs_compact(struct zs_pool *pool, pages_freed += class->pages_per_zspage; } else migrate_write_unlock(src_zspage); - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); cond_resched(); - write_lock(&pool->migrate_lock); - spin_lock(&class->lock); + spin_lock(&pool->lock); } if (src_zspage) { @@ -2089,8 +2239,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, migrate_write_unlock(src_zspage); } - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); return pages_freed; } @@ -2103,8 +2252,6 @@ unsigned long zs_compact(struct zs_pool *pool) for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; pages_freed += __zs_compact(pool, class); @@ -2149,8 +2296,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; @@ -2197,7 +2342,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - rwlock_init(&pool->migrate_lock); + spin_lock_init(&pool->lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) @@ -2268,7 +2413,6 @@ struct zs_pool *zs_create_pool(const char *name) class->index = i; class->pages_per_zspage = pages_per_zspage; class->objs_per_zspage = objs_per_zspage; - spin_lock_init(&class->lock); pool->size_class[i] = class; for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; fullness++) @@ -2288,6 +2432,10 @@ struct zs_pool *zs_create_pool(const char *name) */ zs_register_shrinker(pool); +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&pool->lru); +#endif + return pool; err: @@ -2329,6 +2477,100 @@ void zs_destroy_pool(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_destroy_pool); +#ifdef CONFIG_ZPOOL +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) +{ + int i, obj_idx, ret = 0; + unsigned long handle; + struct zspage *zspage; + struct page *page; + enum fullness_group fullness; + + /* Lock LRU and fullness list */ + spin_lock(&pool->lock); + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } + + for (i = 0; i < retries; i++) { + struct size_class *class; + + zspage = list_last_entry(&pool->lru, struct zspage, lru); + list_del(&zspage->lru); + + /* zs_free may free objects, but not the zspage and handles */ + zspage->under_reclaim = true; + + class = zspage_class(pool, zspage); + fullness = get_fullness_group(class, zspage); + + /* Lock out object allocations and object compaction */ + remove_zspage(class, zspage, fullness); + + spin_unlock(&pool->lock); + cond_resched(); + + /* Lock backing pages into place */ + lock_zspage(zspage); + + obj_idx = 0; + page = get_first_page(zspage); + while (1) { + handle = find_alloced_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } + + /* + * This will write the object and call zs_free. + * + * zs_free will free the object, but the + * under_reclaim flag prevents it from freeing + * the zspage altogether. This is necessary so + * that we can continue working with the + * zspage potentially after the last object + * has been freed. + */ + ret = pool->zpool_ops->evict(pool->zpool, handle); + if (ret) + goto next; + + obj_idx++; + } + +next: + /* For freeing the zspage, or putting it back in the pool and LRU list. */ + spin_lock(&pool->lock); + zspage->under_reclaim = false; + + if (!get_zspage_inuse(zspage)) { + /* + * Fullness went stale as zs_free() won't touch it + * while the page is removed from the pool. Fix it + * up for the check in __free_zspage(). + */ + zspage->fullness = ZS_EMPTY; + + __free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); + return 0; + } + + putback_zspage(class, zspage); + list_add(&zspage->lru, &pool->lru); + unlock_zspage(zspage); + } + + spin_unlock(&pool->lock); + return -EAGAIN; +} +#endif /* CONFIG_ZPOOL */ + static int __init zs_init(void) { int ret; diff --git a/mm/zswap.c b/mm/zswap.c index 104835b379ec..f6c89049cf70 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -958,7 +958,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) }; if (!zpool_can_sleep_mapped(pool)) { - tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!tmp) return -ENOMEM; } @@ -968,6 +968,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) swpentry = zhdr->swpentry; /* here */ tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); + zpool_unmap_handle(pool, handle); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -975,20 +976,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); - zpool_unmap_handle(pool, handle); kfree(tmp); return 0; } spin_unlock(&tree->lock); BUG_ON(offset != entry->offset); - src = (u8 *)zhdr + sizeof(struct zswap_header); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src = tmp; - zpool_unmap_handle(pool, handle); - } - /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ @@ -1006,6 +999,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); dlen = PAGE_SIZE; + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1015,6 +1016,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = acomp_ctx->req->dlen; mutex_unlock(acomp_ctx->mutex); + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); + else + zpool_unmap_handle(pool, handle); + BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -1026,7 +1032,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) SetPageReclaim(page); /* start writeback */ - __swap_writepage(page, &wbc, end_swap_bio_write); + __swap_writepage(page, &wbc); put_page(page); zswap_written_back_pages++; @@ -1045,7 +1051,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - goto end; + return ret; + +fail: + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); /* * if we get here due to ZSWAP_SWAPCACHE_EXIST @@ -1054,17 +1064,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) * if we free the entry in the following put * it is also okay to return !0 */ -fail: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); -end: - if (zpool_can_sleep_mapped(pool)) - zpool_unmap_handle(pool, handle); - else - kfree(tmp); - return ret; } @@ -1311,7 +1314,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } if (!zpool_can_sleep_mapped(entry->pool->zpool)) { - tmp = kmalloc(entry->length, GFP_ATOMIC); + tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto freeentry; |