From 8b9167cd9ef039d95b65ef9600a7507795173121 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 25 Apr 2023 23:52:35 +0800 Subject: mm: compaction: optimize compact_memory to comply with the admin-guide For the /proc/sys/vm/compact_memory file, the admin-guide states: When 1 is written to the file, all zones are compacted such that free memory is available in contiguous blocks where possible. This can be important for example in the allocation of huge pages although processes will also directly compact memory as required But it was not strictly followed, writing any value would cause all zones to be compacted. It has been slightly optimized to comply with the admin-guide. Enforce the 1 on the unlikely chance that the sysctl handler is ever extended to do something different. Commit ef4984384172 ("mm/compaction: remove unused variable sysctl_compact_memory") has also been optimized a bit here, as the declaration in the external header file has been eliminated, and sysctl_compact_memory also needs to be verified. [akpm@linux-foundation.org: add __read_mostly, per Mel] Link: https://lkml.kernel.org/r/tencent_DFF54DB2A60F3333F97D3F6B5441519B050A@qq.com Signed-off-by: Wen Yang Acked-by: Mel Gorman Cc: Oscar Salvador Cc: William Lam Cc: Pintu Kumar Cc: Fu Wei Signed-off-by: Andrew Morton --- mm/compaction.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index c8bcdea15f5f..5584fa5fa3d4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1736,6 +1736,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE */ static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; static int sysctl_extfrag_threshold = 500; +static int __read_mostly sysctl_compact_memory; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) @@ -2780,6 +2781,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int static int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { + int ret; + + ret = proc_dointvec(table, write, buffer, length, ppos); + if (ret) + return ret; + + if (sysctl_compact_memory != 1) + return -EINVAL; + if (write) compact_nodes(); @@ -3095,7 +3105,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, static struct ctl_table vm_compaction[] = { { .procname = "compact_memory", - .data = NULL, + .data = &sysctl_compact_memory, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, -- cgit v1.2.3 From 539aa041a9b1d98cd847b949ba8f91857c995f26 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 15 May 2023 12:33:41 +0100 Subject: mm: compaction: ensure rescanning only happens on partially scanned pageblocks Patch series "Follow-up "Fix excessive CPU usage during compaction"". The series "Fix excessive CPU usage during compaction" [1] attempted to fix a bug [2] but Vlastimil noted that the fix was incomplete [3]. While the series was merged, fast_find_migrateblock was still disabled. This series should fix the corner cases and allow 95e7a450b819 ("Revert "mm/compaction: fix set skip in fast_find_migrateblock"") to be safely reverted. Details on how many pageblocks are rescanned are in the changelog of the last patch. 
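(An illustrative aside on the compact_memory sysctl patch above, before the test report that follows: the reworked handler now parses the written value and rejects anything other than 1 with -EINVAL, while writing 1 still compacts all zones as the admin-guide documents. The user-space sketch below is not part of the series; it only assumes the /proc/sys/vm/compact_memory path from the admin-guide and needs root to run.)

/* Illustrative only: exercises the stricter compact_memory handler above. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* With the patch, any value other than "1" is rejected with EINVAL. */
	if (write(fd, "2", 1) < 0)
		printf("write of 2 rejected: %s\n", strerror(errno));
	/* Writing "1" still compacts all zones. */
	if (write(fd, "1", 1) == 1)
		printf("compaction of all zones requested\n");
	close(fd);
	return 0;
}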
"Raghavendra K T" tested this and reported "decent improvement from perf perspective as well as compaction related data [4] [1] https://lore.kernel.org/r/20230125134434.18017-1-mgorman@techsingularity.net [2] https://bugzilla.suse.com/show_bug.cgi?id=1206848 [3] https://lore.kernel.org/r/a55cf026-a2f9-ef01-9a4c-398693e048ea@suse.cz [4] https://lkml.kernel.org/r/6d62686f-964d-342c-e085-0eae2555cc54@amd.com This patch (of 4): compact_zone() intends to rescan pageblocks if there is a failure to migrate "within the current order-aligned block". However, the pageblock scan may already be complete and moved to the next block causing the next pageblock to be "rescanned". Ensure only the most recent pageblock is rescanned. Link: https://lkml.kernel.org/r/20230515113344.6869-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20230515113344.6869-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Vlastimil Babka Tested-by: Raghavendra K T Acked-by: Vlastimil Babka Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Mel Gorman Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Signed-off-by: Andrew Morton --- mm/compaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 5584fa5fa3d4..d16b0fcd6db5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2465,8 +2465,9 @@ rescan: * fast_find_migrateblock revisiting blocks that were * recently partially scanned. */ - if (cc->direct_compaction && !cc->finish_pageblock && - (cc->mode < MIGRATE_SYNC)) { + if (!pageblock_aligned(cc->migrate_pfn) && + cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { cc->finish_pageblock = true; /* -- cgit v1.2.3 From 9ecc5fc50a9c75b0af252913163a9f1ac4206b17 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 15 May 2023 12:33:42 +0100 Subject: mm: compaction: only force pageblock scan completion when skip hints are obeyed fast_find_migrateblock relies on skip hints to avoid rescanning a recently selected pageblock but compact_zone() only forces the pageblock scan completion to set the skip hint if in direct compaction. While this prevents direct compaction repeatedly scanning a subset of blocks due to fast_find_migrateblock(), it does not prevent proactive compaction, node compaction and kcompactd encountering the same problem described in commit cfccd2e63e7e ("mm, compaction: finish pageblocks on complete migration failure"). Force the scan completion of a pageblock to set the skip hint if skip hints are obeyed to prevent fast_find_migrateblock() repeatedly selecting a subset of pageblocks. Link: https://lkml.kernel.org/r/20230515113344.6869-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Suggested-by: Vlastimil Babka Tested-by: Raghavendra K T Acked-by: Vlastimil Babka Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Signed-off-by: Andrew Morton --- mm/compaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index d16b0fcd6db5..009128d1e4ef 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2457,7 +2457,8 @@ rescan: } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page - * within the current order-aligned block, scan the + * within the current order-aligned block and + * fast_find_migrateblock may be used then scan the * remainder of the pageblock. 
This will mark the * pageblock "skip" to avoid rescanning in the near * future. This will isolate more pages than necessary @@ -2466,7 +2467,7 @@ rescan: * recently partially scanned. */ if (!pageblock_aligned(cc->migrate_pfn) && - cc->direct_compaction && !cc->finish_pageblock && + !cc->ignore_skip_hint && !cc->finish_pageblock && (cc->mode < MIGRATE_SYNC)) { cc->finish_pageblock = true; -- cgit v1.2.3 From 590ccea80af950685de7f72ec43831765e5c8cb1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 15 May 2023 12:33:43 +0100 Subject: mm: compaction: update pageblock skip when first migration candidate is not at the start isolate_migratepages_block should mark a pageblock as skip if scanning started on an aligned pageblock boundary but it only updates the skip flag if the first migration candidate is also aligned. Tracing during a compaction stress load (mmtests: workload-usemem-stress-numa-compact) that many pageblocks are not marked skip causing excessive scanning of blocks that had been recently checked. Update pageblock skip based on "valid_page" which is set if scanning started on a pageblock boundary. [mgorman@techsingularity.net: fix handling of skip bit] Link: https://lkml.kernel.org/r/20230602111622.swtxhn6lu2qwgrwq@techsingularity.net Link: https://lkml.kernel.org/r/20230515113344.6869-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Raghavendra K T Acked-by: Vlastimil Babka Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Signed-off-by: Andrew Morton --- mm/compaction.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 009128d1e4ef..02aa3788765d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -392,18 +392,14 @@ void reset_isolation_suitable(pg_data_t *pgdat) * Sets the pageblock skip bit if it was clear. Note that this is a hint as * locks are not required for read/writers. Returns true if it was already set. */ -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) +static bool test_and_set_skip(struct compact_control *cc, struct page *page) { bool skip; - /* Do no update if skip hint is being ignored */ + /* Do not update if skip hint is being ignored */ if (cc->ignore_skip_hint) return false; - if (!pageblock_aligned(pfn)) - return false; - skip = get_pageblock_skip(page); if (!skip && !cc->no_set_skip_hint) set_pageblock_skip(page); @@ -470,8 +466,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { } -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) +static bool test_and_set_skip(struct compact_control *cc, struct page *page) { return false; } @@ -1074,11 +1069,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, lruvec_memcg_debug(lruvec, page_folio(page)); - /* Try get exclusive access under lock */ - if (!skip_updated) { + /* + * Try get exclusive access under lock. If marked for + * skip, the scan is aborted unless the current context + * is a rescan to reach the end of the pageblock. 
+ */ + if (!skip_updated && valid_page) { skip_updated = true; - if (test_and_set_skip(cc, page, low_pfn)) + if (test_and_set_skip(cc, valid_page) && + !cc->finish_pageblock) { goto isolate_abort; + } } /* -- cgit v1.2.3 From 90ed667c03fe553a41d79057740ed5df951eead0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 15 May 2023 12:33:44 +0100 Subject: Revert "Revert "mm/compaction: fix set skip in fast_find_migrateblock"" This reverts commit 95e7a450b819 ("Revert "mm/compaction: fix set skip in fast_find_migrateblock""). Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") was reverted due to bug reports about khugepaged consuming large amounts of CPU without making progress. The underlying bug was partially fixed by commit cfccd2e63e7e ("mm, compaction: finish pageblocks on complete migration failure") but it only mitigated the problem and Vlastimil Babka pointing out the same issue could theoretically happen to kcompactd. As pageblocks containing pages that fail to migrate should now be forcibly rescanned to set the skip hint if skip hints are used, fast_find_migrateblock() should no longer loop on a small subset of pageblocks for prolonged periods of time. Revert the revert so fast_find_migrateblock() is effective again. Using the mmtests config workload-usemem-stress-numa-compact, the number of unique ranges scanned was analysed for both kcompactd and !kcompactd activity. 6.4.0-rc1-vanilla kcompactd 7 range=(0x10d600~0x10d800) 7 range=(0x110c00~0x110e00) 7 range=(0x110e00~0x111000) 7 range=(0x111800~0x111a00) 7 range=(0x111a00~0x111c00) !kcompactd 1 range=(0x113e00~0x114000) 1 range=(0x114000~0x114020) 1 range=(0x114400~0x114489) 1 range=(0x114489~0x1144aa) 1 range=(0x1144aa~0x114600) 6.4.0-rc1-mm-revertfastmigrate kcompactd 17 range=(0x104200~0x104400) 17 range=(0x104400~0x104600) 17 range=(0x104600~0x104800) 17 range=(0x104800~0x104a00) 17 range=(0x104a00~0x104c00) !kcompactd 1793 range=(0x15c200~0x15c400) 5436 range=(0x105800~0x105a00) 19826 range=(0x150a00~0x150c00) 19833 range=(0x150800~0x150a00) 19834 range=(0x11ce00~0x11d000) 6.4.0-rc1-mm-follupfastfind kcompactd 22 range=(0x107200~0x107400) 23 range=(0x107400~0x107600) 23 range=(0x107600~0x107800) 23 range=(0x107c00~0x107e00) 23 range=(0x107e00~0x108000) !kcompactd 3 range=(0x890240~0x890400) 5 range=(0x886e00~0x887000) 5 range=(0x88a400~0x88a600) 6 range=(0x88f800~0x88fa00) 9 range=(0x88a400~0x88a420) Note that the vanilla kernel and the full series had some duplication of ranges scanned but it was not severe and would be in line with compaction resets when the skip hints are cleared. Just a revert of commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") showed excessive rescans of the same ranges so the series should not reintroduce bug 1206848. 
Link: https://bugzilla.suse.com/show_bug.cgi?id=1206848 Link: https://lkml.kernel.org/r/20230515113344.6869-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Raghavendra K T Acked-by: Vlastimil Babka Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 02aa3788765d..f6465ae74d3f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1866,7 +1866,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } -- cgit v1.2.3 From 4e096ae1801e24b338e02715c65c3ffa8883ba5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 13 May 2023 01:11:01 +0100 Subject: mm: convert migrate_pages() to work on folios Almost all of the callers & implementors of migrate_pages() were already converted to use folios. compaction_alloc() & compaction_free() are trivial to convert a part of this patch and not worth splitting out. Link: https://lkml.kernel.org/r/20230513001101.276972-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- Documentation/mm/page_migration.rst | 7 +- .../translations/zh_CN/mm/page_migration.rst | 2 +- include/linux/migrate.h | 16 +- mm/compaction.c | 15 +- mm/mempolicy.c | 15 +- mm/migrate.c | 161 ++++++++++----------- mm/vmscan.c | 15 +- 7 files changed, 108 insertions(+), 123 deletions(-) (limited to 'mm/compaction.c') diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst index 313dce18893e..e35af7805be5 100644 --- a/Documentation/mm/page_migration.rst +++ b/Documentation/mm/page_migration.rst @@ -73,14 +73,13 @@ In kernel use of migrate_pages() It also prevents the swapper or other scans from encountering the page. -2. We need to have a function of type new_page_t that can be +2. We need to have a function of type new_folio_t that can be passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. + how to allocate the correct new folio given the old folio. 3. The migrate_pages() function is called which attempts to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. + the new folio for each folio that is considered for moving. How migrate_pages() works ========================= diff --git a/Documentation/translations/zh_CN/mm/page_migration.rst b/Documentation/translations/zh_CN/mm/page_migration.rst index 076081dc1635..f95063826a15 100644 --- a/Documentation/translations/zh_CN/mm/page_migration.rst +++ b/Documentation/translations/zh_CN/mm/page_migration.rst @@ -55,7 +55,7 @@ mbind()设置一个新的内存策略。一个进程的页面也可以通过sys_ 消失。它还可以防止交换器或其他扫描器遇到该页。 -2. 我们需要有一个new_page_t类型的函数,可以传递给migrate_pages()。这个函数应该计算 +2. 我们需要有一个new_folio_t类型的函数,可以传递给migrate_pages()。这个函数应该计算 出如何在给定的旧页面中分配正确的新页面。 3. 
migrate_pages()函数被调用,它试图进行迁移。它将调用该函数为每个被考虑迁移的页面分 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6241a1596a75..6de5756d8533 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,8 @@ #include #include -typedef struct page *new_page_t(struct page *page, unsigned long private); -typedef void free_page_t(struct page *page, unsigned long private); +typedef struct folio *new_folio_t(struct folio *folio, unsigned long private); +typedef void free_folio_t(struct folio *folio, unsigned long private); struct migration_target_control; @@ -67,10 +67,10 @@ int migrate_folio_extra(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); -int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, +int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded); -struct page *alloc_migration_target(struct page *page, unsigned long private); +struct folio *alloc_migration_target(struct folio *src, unsigned long private); bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -85,11 +85,11 @@ int folio_migrate_mapping(struct address_space *mapping, #else static inline void putback_movable_pages(struct list_head *l) {} -static inline int migrate_pages(struct list_head *l, new_page_t new, - free_page_t free, unsigned long private, enum migrate_mode mode, - int reason, unsigned int *ret_succeeded) +static inline int migrate_pages(struct list_head *l, new_folio_t new, + free_folio_t free, unsigned long private, + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { return -ENOSYS; } -static inline struct page *alloc_migration_target(struct page *page, +static inline struct folio *alloc_migration_target(struct folio *src, unsigned long private) { return NULL; } static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) diff --git a/mm/compaction.c b/mm/compaction.c index f6465ae74d3f..e23e00bec030 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1685,11 +1685,10 @@ splitmap: * This is a migrate-callback that "allocates" freepages by taking pages * from the isolated freelists in the block we are migrating to. */ -static struct page *compaction_alloc(struct page *migratepage, - unsigned long data) +static struct folio *compaction_alloc(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; - struct page *freepage; + struct folio *dst; if (list_empty(&cc->freepages)) { isolate_freepages(cc); @@ -1698,11 +1697,11 @@ static struct page *compaction_alloc(struct page *migratepage, return NULL; } - freepage = list_entry(cc->freepages.next, struct page, lru); - list_del(&freepage->lru); + dst = list_entry(cc->freepages.next, struct folio, lru); + list_del(&dst->lru); cc->nr_freepages--; - return freepage; + return dst; } /* @@ -1710,11 +1709,11 @@ static struct page *compaction_alloc(struct page *migratepage, * freelist. All pages on the freelist are from the same zone, so there is no * special handling needed for NUMA. 
*/ -static void compaction_free(struct page *page, unsigned long data) +static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; - list_add(&page->lru, &cc->freepages); + list_add(&dst->lru, &cc->freepages); cc->nr_freepages++; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1756389a0609..f06ca8c18e62 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1195,24 +1195,22 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *new_folio(struct folio *src, unsigned long start) { - struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; for_each_vma(vmi, vma) { - address = page_address_in_vma(page, vma); + address = page_address_in_vma(&src->page, vma); if (address != -EFAULT) break; } if (folio_test_hugetlb(src)) { - dst = alloc_hugetlb_folio_vma(folio_hstate(src), + return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); - return &dst->page; } if (folio_test_large(src)) @@ -1221,9 +1219,8 @@ static struct page *new_page(struct page *page, unsigned long start) /* * if !vma, vma_alloc_folio() will use task or system default policy */ - dst = vma_alloc_folio(gfp, folio_order(src), vma, address, + return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); - return &dst->page; } #else @@ -1239,7 +1236,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *new_folio(struct folio *src, unsigned long start) { return NULL; } @@ -1334,7 +1331,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, NULL, + nr_failed = migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index cb292d2a90ce..30b5ce10935e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1067,15 +1067,13 @@ static void migrate_folio_undo_src(struct folio *src, } /* Restore the destination folio to the original state upon failure */ -static void migrate_folio_undo_dst(struct folio *dst, - bool locked, - free_page_t put_new_page, - unsigned long private) +static void migrate_folio_undo_dst(struct folio *dst, bool locked, + free_folio_t put_new_folio, unsigned long private) { if (locked) folio_unlock(dst); - if (put_new_page) - put_new_page(&dst->page, private); + if (put_new_folio) + put_new_folio(dst, private); else folio_put(dst); } @@ -1099,14 +1097,13 @@ static void migrate_folio_done(struct folio *src, } /* Obtain the lock on page, remove all ptes. 
*/ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) +static int migrate_folio_unmap(new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + struct folio *src, struct folio **dstp, enum migrate_mode mode, + enum migrate_reason reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; - struct page *newpage = NULL; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); @@ -1123,10 +1120,9 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page return MIGRATEPAGE_SUCCESS; } - newpage = get_new_page(&src->page, private); - if (!newpage) + dst = get_new_folio(src, private); + if (!dst) return -ENOMEM; - dst = page_folio(newpage); *dstp = dst; dst->private = NULL; @@ -1254,13 +1250,13 @@ out: ret = NULL; migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); - migrate_folio_undo_dst(dst, dst_locked, put_new_page, private); + migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; } /* Migrate the folio to the newly allocated folio in dst. */ -static int migrate_folio_move(free_page_t put_new_page, unsigned long private, +static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) @@ -1332,7 +1328,7 @@ out: } migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); - migrate_folio_undo_dst(dst, true, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; } @@ -1355,16 +1351,14 @@ out: * because then pte is replaced with migration swap entry and direct I/O code * will wait in the page fault for migration to complete. */ -static int unmap_and_move_huge_page(new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - struct page *hpage, int force, - enum migrate_mode mode, int reason, - struct list_head *ret) +static int unmap_and_move_huge_page(new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + struct folio *src, int force, enum migrate_mode mode, + int reason, struct list_head *ret) { - struct folio *dst, *src = page_folio(hpage); + struct folio *dst; int rc = -EAGAIN; int page_was_mapped = 0; - struct page *new_hpage; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -1374,10 +1368,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return MIGRATEPAGE_SUCCESS; } - new_hpage = get_new_page(hpage, private); - if (!new_hpage) + dst = get_new_folio(src, private); + if (!dst) return -ENOMEM; - dst = page_folio(new_hpage); if (!folio_trylock(src)) { if (!force) @@ -1418,7 +1411,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_page_mapping_lock_write(&src->page); if (unlikely(!mapping)) goto unlock_put_anon; @@ -1448,7 +1441,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { move_hugetlb_state(src, dst, reason); - put_new_page = NULL; + put_new_folio = NULL; } out_unlock: @@ -1464,8 +1457,8 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. 
*/ - if (put_new_page) - put_new_page(new_hpage, private); + if (put_new_folio) + put_new_folio(dst, private); else folio_putback_active_hugetlb(dst); @@ -1512,8 +1505,8 @@ struct migrate_pages_stats { * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. */ -static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, +static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct migrate_pages_stats *stats, struct list_head *ret_folios) @@ -1551,9 +1544,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, continue; } - rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, - &folio->page, pass > 2, mode, + rc = unmap_and_move_huge_page(get_new_folio, + put_new_folio, private, + folio, pass > 2, mode, reason, ret_folios); /* * The rules are: @@ -1610,11 +1603,11 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the * length of the from list must be <= 1. */ -static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct list_head *split_folios, struct migrate_pages_stats *stats, - int nr_pass) +static int migrate_pages_batch(struct list_head *from, + new_folio_t get_new_folio, free_folio_t put_new_folio, + unsigned long private, enum migrate_mode mode, int reason, + struct list_head *ret_folios, struct list_head *split_folios, + struct migrate_pages_stats *stats, int nr_pass) { int retry = 1; int thp_retry = 1; @@ -1664,8 +1657,9 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, continue; } - rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, mode, reason, ret_folios); + rc = migrate_folio_unmap(get_new_folio, put_new_folio, + private, folio, &dst, mode, reason, + ret_folios); /* * The rules are: * Success: folio will be freed @@ -1762,7 +1756,7 @@ move: cond_resched(); - rc = migrate_folio_move(put_new_page, private, + rc = migrate_folio_move(put_new_folio, private, folio, dst, mode, reason, ret_folios); /* @@ -1808,7 +1802,7 @@ out: migrate_folio_undo_src(folio, page_was_mapped, anon_vma, true, ret_folios); list_del(&dst->lru); - migrate_folio_undo_dst(dst, true, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; dst2 = list_next_entry(dst, lru); } @@ -1816,10 +1810,11 @@ out: return rc; } -static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct list_head *split_folios, struct migrate_pages_stats *stats) +static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + enum migrate_mode mode, int reason, + struct list_head *ret_folios, struct list_head *split_folios, + struct migrate_pages_stats *stats) { int rc, nr_failed = 0; LIST_HEAD(folios); @@ -1827,7 +1822,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, memset(&astats, 0, sizeof(astats)); /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ - rc = 
migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC, + rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &folios, split_folios, &astats, NR_MAX_MIGRATE_ASYNC_RETRY); stats->nr_succeeded += astats.nr_succeeded; @@ -1849,7 +1844,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, list_splice_tail_init(&folios, from); while (!list_empty(from)) { list_move(from->next, &folios); - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, + rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, ret_folios, split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); list_splice_tail_init(&folios, ret_folios); @@ -1866,11 +1861,11 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, * supplied as the target for the page migration * * @from: The list of folios to be migrated. - * @get_new_page: The function used to allocate free folios to be used + * @get_new_folio: The function used to allocate free folios to be used * as the target of the folio migration. - * @put_new_page: The function used to free target folios if migration + * @put_new_folio: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. - * @private: Private data to be passed on to get_new_page() + * @private: Private data to be passed on to get_new_folio() * @mode: The migration mode that specifies the constraints for * folio migration, if any. * @reason: The reason for folio migration. @@ -1887,8 +1882,8 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, * considered as the number of non-migrated large folio, no matter how many * split folios of the large folio are migrated successfully. */ -int migrate_pages(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, +int migrate_pages(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; @@ -1903,7 +1898,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, memset(&stats, 0, sizeof(stats)); - rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private, + rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; @@ -1926,12 +1921,14 @@ again: else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &split_folios, &stats, - NR_MAX_MIGRATE_PAGES_RETRY); + rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, + private, mode, reason, &ret_folios, + &split_folios, &stats, + NR_MAX_MIGRATE_PAGES_RETRY); else - rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &split_folios, &stats); + rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, + private, mode, reason, &ret_folios, + &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; @@ -1944,8 +1941,9 @@ again: * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. 
*/ - migrate_pages_batch(&split_folios, get_new_page, put_new_page, private, - MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); + migrate_pages_batch(&split_folios, get_new_folio, + put_new_folio, private, MIGRATE_ASYNC, reason, + &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; @@ -1980,14 +1978,11 @@ out: return rc_gather; } -struct page *alloc_migration_target(struct page *page, unsigned long private) +struct folio *alloc_migration_target(struct folio *src, unsigned long private) { - struct folio *folio = page_folio(page); struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; - struct folio *hugetlb_folio = NULL; - struct folio *new_folio = NULL; int nid; int zidx; @@ -1995,33 +1990,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) gfp_mask = mtc->gfp_mask; nid = mtc->nid; if (nid == NUMA_NO_NODE) - nid = folio_nid(folio); + nid = folio_nid(src); - if (folio_test_hugetlb(folio)) { - struct hstate *h = folio_hstate(folio); + if (folio_test_hugetlb(src)) { + struct hstate *h = folio_hstate(src); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + return alloc_hugetlb_folio_nodemask(h, nid, mtc->nmask, gfp_mask); - return &hugetlb_folio->page; } - if (folio_test_large(folio)) { + if (folio_test_large(src)) { /* * clear __GFP_RECLAIM to make the migration callback * consistent with regular THP allocations. */ gfp_mask &= ~__GFP_RECLAIM; gfp_mask |= GFP_TRANSHUGE; - order = folio_order(folio); + order = folio_order(src); } - zidx = zone_idx(folio_zone(folio)); + zidx = zone_idx(folio_zone(src)); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; - new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask); - - return &new_folio->page; + return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } #ifdef CONFIG_NUMA @@ -2472,13 +2464,12 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, return false; } -static struct page *alloc_misplaced_dst_page(struct page *page, +static struct folio *alloc_misplaced_dst_folio(struct folio *src, unsigned long data) { int nid = (int) data; - int order = compound_order(page); + int order = folio_order(src); gfp_t gfp = __GFP_THISNODE; - struct folio *new; if (order > 0) gfp |= GFP_TRANSHUGE_LIGHT; @@ -2487,9 +2478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN; gfp &= ~__GFP_RECLAIM; } - new = __folio_alloc_node(gfp, order, nid); - - return &new->page; + return __folio_alloc_node(gfp, order, nid); } static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) @@ -2567,7 +2556,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 15efbfbb1963..4637f6462e9c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1621,9 +1621,10 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -static struct page *alloc_demote_page(struct page *page, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { - struct page *target_page; + struct folio *dst; nodemask_t 
*allowed_mask; struct migration_target_control *mtc; @@ -1641,14 +1642,14 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) */ mtc->nmask = NULL; mtc->gfp_mask |= __GFP_THISNODE; - target_page = alloc_migration_target(page, (unsigned long)mtc); - if (target_page) - return target_page; + dst = alloc_migration_target(src, (unsigned long)mtc); + if (dst) + return dst; mtc->gfp_mask &= ~__GFP_THISNODE; mtc->nmask = allowed_mask; - return alloc_migration_target(page, (unsigned long)mtc); + return alloc_migration_target(src, (unsigned long)mtc); } /* @@ -1683,7 +1684,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); -- cgit v1.2.3 From e8606320e9af9774fd879e71c940fc9e5fd9b901 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 14:39:57 +0200 Subject: mm: compaction: refactor __compaction_suitable() __compaction_suitable() is supposed to check for available migration targets. However, it also checks whether the operation was requested via /proc/sys/vm/compact_memory, and whether the original allocation request can already succeed. These don't apply to all callsites. Move the checks out to the callers, so that later patches can deal with them one by one. No functional change intended. [hannes@cmpxchg.org: fix comment, per Vlastimil] Link: https://lkml.kernel.org/r/20230602144942.GC161817@cmpxchg.org Link: https://lkml.kernel.org/r/20230519123959.77335-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/compaction.h | 4 +-- mm/compaction.c | 79 +++++++++++++++++++++++++++++----------------- mm/vmscan.c | 35 ++++++++++++-------- 3 files changed, 73 insertions(+), 45 deletions(-) (limited to 'mm/compaction.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 1f0328a2ba48..9f7cf3e1bf89 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -90,7 +90,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx); + int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -108,7 +108,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int highest_zoneidx) + int highest_zoneidx) { return COMPACT_SKIPPED; } diff --git a/mm/compaction.c b/mm/compaction.c index e23e00bec030..bb9b76244a5d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2194,24 +2194,10 @@ static enum compact_result compact_finished(struct compact_control *cc) } static enum compact_result __compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; - - if (is_via_compact_memory(order)) - return COMPACT_CONTINUE; - - watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); - /* - * If watermarks for high-order allocation are already met, there - * should be no 
need for compaction at all. - */ - if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, - alloc_flags)) - return COMPACT_SUCCESS; - /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the @@ -2240,17 +2226,15 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * compaction_suitable: Is this suitable to run compaction on this zone now? * Returns * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_SUCCESS - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, + ret = __compaction_suitable(zone, order, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2294,7 +2278,16 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->highest_zoneidx, ac->nodemask) { unsigned long available; - enum compact_result compact_result; + unsigned long watermark; + + if (is_via_compact_memory(order)) + return true; + + /* Allocation can already succeed, nothing to do */ + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(zone, order, watermark, + ac->highest_zoneidx, alloc_flags)) + continue; /* * Do not consider all the reclaimable memory because we do not @@ -2304,9 +2297,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, */ available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); - compact_result = __compaction_suitable(zone, order, alloc_flags, - ac->highest_zoneidx, available); - if (compact_result == COMPACT_CONTINUE) + if (__compaction_suitable(zone, order, ac->highest_zoneidx, + available) == COMPACT_CONTINUE) return true; } @@ -2336,11 +2328,23 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); - ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->highest_zoneidx); - /* Compaction is likely to fail */ - if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) - return ret; + + if (!is_via_compact_memory(cc->order)) { + unsigned long watermark; + + /* Allocation can already succeed, nothing to do */ + watermark = wmark_pages(cc->zone, + cc->alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(cc->zone, cc->order, watermark, + cc->highest_zoneidx, cc->alloc_flags)) + return COMPACT_SUCCESS; + + ret = compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx); + /* Compaction is likely to fail */ + if (ret == COMPACT_SKIPPED) + return ret; + } /* * Clear pageblock skip if there were failures recently and compaction @@ -2844,7 +2848,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + if (is_via_compact_memory(pgdat->kcompactd_max_order)) + return true; + + /* Allocation can already succeed, check other zones */ + if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, + min_wmark_pages(zone), + highest_zoneidx, 0)) + continue; + + if (compaction_suitable(zone, 
pgdat->kcompactd_max_order, highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2882,10 +2895,18 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - if (compaction_suitable(zone, cc.order, 0, zoneid) != - COMPACT_CONTINUE) + if (is_via_compact_memory(cc.order)) + goto compact; + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, cc.order, + min_wmark_pages(zone), zoneid, 0)) continue; + if (compaction_suitable(zone, cc.order, + zoneid) != COMPACT_CONTINUE) + continue; +compact: if (kthread_should_stop()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4637f6462e9c..9f8bfd1fcf58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6399,14 +6399,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!managed_zone(zone)) continue; - switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_SUCCESS: - case COMPACT_CONTINUE: + if (sc->order == -1) /* is_via_compact_memory() */ + return false; + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) + return false; + + if (compaction_suitable(zone, sc->order, + sc->reclaim_idx) == COMPACT_CONTINUE) return false; - default: - /* check next zone */ - ; - } } /* @@ -6594,16 +6597,20 @@ again: static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - enum compact_result suitable; - suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); - if (suitable == COMPACT_SUCCESS) - /* Allocation should succeed already. Don't reclaim. */ + if (sc->order == -1) /* is_via_compact_memory() */ + goto suitable; + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) return true; - if (suitable == COMPACT_SKIPPED) - /* Compaction cannot yet proceed. Do reclaim. */ - return false; + /* Compaction cannot yet proceed. Do reclaim. */ + if (compaction_suitable(zone, sc->order, + sc->reclaim_idx) == COMPACT_SKIPPED) + return false; +suitable: /* * Compaction is already possible, but it takes time to run and there * are potentially other callers using the pages just freed. So proceed -- cgit v1.2.3 From f98a497e1f16ee411df72629e32e31cba4cfa9cf Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 14:39:58 +0200 Subject: mm: compaction: remove unnecessary is_via_compact_memory() checks Remove from all paths not reachable via /proc/sys/vm/compact_memory. 
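A quick aside on the is_via_compact_memory() checks that this patch and the previous one move around: the helper itself is not shown in these hunks, but the "sc->order == -1" annotations in the vmscan.c hunks above pin down its meaning. A minimal sketch, assuming the helper matches that annotation:

#include <stdbool.h>

/*
 * Stand-in for the kernel helper referenced throughout these patches:
 * order == -1 is the "impossible" order used only when compaction is
 * requested explicitly via /proc/sys/vm/compact_memory, never on behalf
 * of a real allocation.
 */
static inline bool is_via_compact_memory(int order)
{
	return order == -1;
}

With that in mind, dropping the checks from the reclaim and kcompactd paths is safe because those paths are never entered with order == -1.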
Link: https://lkml.kernel.org/r/20230519123959.77335-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: Baolin Wang Signed-off-by: Andrew Morton --- mm/compaction.c | 11 +---------- mm/vmscan.c | 8 +------- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index bb9b76244a5d..bc1f389ed378 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2280,9 +2280,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, unsigned long available; unsigned long watermark; - if (is_via_compact_memory(order)) - return true; - /* Allocation can already succeed, nothing to do */ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (zone_watermark_ok(zone, order, watermark, @@ -2848,9 +2845,6 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - if (is_via_compact_memory(pgdat->kcompactd_max_order)) - return true; - /* Allocation can already succeed, check other zones */ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, min_wmark_pages(zone), @@ -2895,9 +2889,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - if (is_via_compact_memory(cc.order)) - goto compact; - /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, cc.order, min_wmark_pages(zone), zoneid, 0)) @@ -2906,7 +2897,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_suitable(zone, cc.order, zoneid) != COMPACT_CONTINUE) continue; -compact: + if (kthread_should_stop()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9f8bfd1fcf58..99e4ae44850d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6399,9 +6399,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!managed_zone(zone)) continue; - if (sc->order == -1) /* is_via_compact_memory() */ - return false; - /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) @@ -6598,9 +6595,6 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - if (sc->order == -1) /* is_via_compact_memory() */ - goto suitable; - /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) @@ -6610,7 +6604,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) if (compaction_suitable(zone, sc->order, sc->reclaim_idx) == COMPACT_SKIPPED) return false; -suitable: + /* * Compaction is already possible, but it takes time to run and there * are potentially other callers using the pages just freed. So proceed -- cgit v1.2.3 From 1c9568e806a589da84b7afbdf0619b2c1f6c102a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 14:39:59 +0200 Subject: mm: compaction: drop redundant watermark check in compaction_zonelist_suitable() The watermark check in compaction_zonelist_suitable(), called from should_compact_retry(), is sandwiched between two watermark checks already: before, there are freelist attempts as part of direct reclaim and direct compaction; after, there is a last-minute freelist attempt in __alloc_pages_may_oom(). The check in compaction_zonelist_suitable() isn't necessary. Kill it. 
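For a sense of how much free-page headroom the suitability test that survives these cleanups asks for: __compaction_suitable() (reshaped in the next patch) requires the order-0 low or min watermark plus compact_gap(order) before the free scanner is allowed to run. The arithmetic below is a hedged illustration only; it assumes compact_gap() is still the usual 2UL << order and uses a made-up watermark value.

#include <stdio.h>

/* Assumption: mirrors compact_gap() in include/linux/compaction.h. */
static unsigned long compact_gap(unsigned int order)
{
	return 2UL << order;
}

int main(void)
{
	unsigned long min_wmark = 11000; /* example order-0 min watermark, in pages */
	unsigned int order = 9;          /* a 2MB THP on x86-64 with 4K pages */

	/* Free pages wanted before compaction is considered suitable. */
	printf("watermark target: %lu pages\n", min_wmark + compact_gap(order));
	return 0;
}

For an order-9 request this adds 1024 pages of slack on top of the order-0 watermark, reflecting the free pages the migration scanner needs as isolation targets.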
Link: https://lkml.kernel.org/r/20230519123959.77335-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: Baolin Wang Signed-off-by: Andrew Morton --- mm/compaction.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index bc1f389ed378..470cfd24ef18 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2278,13 +2278,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->highest_zoneidx, ac->nodemask) { unsigned long available; - unsigned long watermark; - - /* Allocation can already succeed, nothing to do */ - watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); - if (zone_watermark_ok(zone, order, watermark, - ac->highest_zoneidx, alloc_flags)) - continue; /* * Do not consider all the reclaimable memory because we do not -- cgit v1.2.3 From 3cf04937529020e149666f56a41ebdeb226b69ed Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 2 Jun 2023 11:12:04 -0400 Subject: mm: compaction: have compaction_suitable() return bool Since it only returns COMPACT_CONTINUE or COMPACT_SKIPPED now, a bool return value simplifies the callsites. Link: https://lkml.kernel.org/r/20230602151204.GD161817@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Baolin Wang Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/compaction.h | 6 ++--- mm/compaction.c | 64 ++++++++++++++++++++++------------------------ mm/vmscan.c | 6 ++--- 3 files changed, 36 insertions(+), 40 deletions(-) (limited to 'mm/compaction.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9f7cf3e1bf89..57b16e69c19a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -89,7 +89,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, const struct alloc_context *ac, enum compact_priority prio, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern enum compact_result compaction_suitable(struct zone *zone, int order, +extern bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, @@ -107,10 +107,10 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline enum compact_result compaction_suitable(struct zone *zone, int order, +static inline bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { - return COMPACT_SKIPPED; + return false; } static inline void kcompactd_run(int nid) diff --git a/mm/compaction.c b/mm/compaction.c index 470cfd24ef18..9b550bfe900b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2193,9 +2193,9 @@ static enum compact_result compact_finished(struct compact_control *cc) return ret; } -static enum compact_result __compaction_suitable(struct zone *zone, int order, - int highest_zoneidx, - unsigned long wmark_target) +static bool __compaction_suitable(struct zone *zone, int order, + int highest_zoneidx, + unsigned long wmark_target) { unsigned long watermark; /* @@ -2215,27 +2215,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, - ALLOC_CMA, wmark_target)) - return COMPACT_SKIPPED; - - return COMPACT_CONTINUE; + return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, + ALLOC_CMA, wmark_target); } /* * compaction_suitable: Is this suitable to run compaction on this zone now? - * Returns - * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_CONTINUE - If compaction should run now */ -enum compact_result compaction_suitable(struct zone *zone, int order, - int highest_zoneidx) +bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { - enum compact_result ret; - int fragindex; + enum compact_result compact_result; + bool suitable; - ret = __compaction_suitable(zone, order, highest_zoneidx, - zone_page_state(zone, NR_FREE_PAGES)); + suitable = __compaction_suitable(zone, order, highest_zoneidx, + zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation @@ -2252,17 +2245,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order, * excessive compaction for costly orders, but it should not be at the * expense of system stability. */ - if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - ret = COMPACT_NOT_SUITABLE_ZONE; + if (suitable) { + compact_result = COMPACT_CONTINUE; + if (order > PAGE_ALLOC_COSTLY_ORDER) { + int fragindex = fragmentation_index(zone, order); + + if (fragindex >= 0 && + fragindex <= sysctl_extfrag_threshold) { + suitable = false; + compact_result = COMPACT_NOT_SUITABLE_ZONE; + } + } + } else { + compact_result = COMPACT_SKIPPED; } - trace_mm_compaction_suitable(zone, order, ret); - if (ret == COMPACT_NOT_SUITABLE_ZONE) - ret = COMPACT_SKIPPED; + trace_mm_compaction_suitable(zone, order, compact_result); - return ret; + return suitable; } bool compaction_zonelist_suitable(struct alloc_context *ac, int order, @@ -2288,7 +2288,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); if (__compaction_suitable(zone, order, ac->highest_zoneidx, - available) == COMPACT_CONTINUE) + available)) return true; } @@ -2329,11 +2329,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->highest_zoneidx, cc->alloc_flags)) return COMPACT_SUCCESS; - ret = compaction_suitable(cc->zone, cc->order, - cc->highest_zoneidx); /* Compaction is likely to fail */ - if (ret == COMPACT_SKIPPED) - return ret; + if (!compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx)) + return COMPACT_SKIPPED; } /* @@ -2845,7 +2844,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, - highest_zoneidx) == COMPACT_CONTINUE) + highest_zoneidx)) return true; } @@ -2887,8 +2886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) min_wmark_pages(zone), zoneid, 0)) continue; - if (compaction_suitable(zone, cc.order, - zoneid) != COMPACT_CONTINUE) + if (!compaction_suitable(zone, cc.order, zoneid)) continue; if (kthread_should_stop()) diff --git a/mm/vmscan.c b/mm/vmscan.c index 99e4ae44850d..df7e52b522ec 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6404,8 +6404,7 @@ static inline bool 
should_continue_reclaim(struct pglist_data *pgdat, sc->reclaim_idx, 0)) return false; - if (compaction_suitable(zone, sc->order, - sc->reclaim_idx) == COMPACT_CONTINUE) + if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; } @@ -6601,8 +6600,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) return true; /* Compaction cannot yet proceed. Do reclaim. */ - if (compaction_suitable(zone, sc->order, - sc->reclaim_idx) == COMPACT_SKIPPED) + if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; /* -- cgit v1.2.3 From 4fbbb3fde3c69879ceebb33a8edd9d867008728b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 13:13:59 +0200 Subject: mm: compaction: avoid GFP_NOFS ABBA deadlock During stress testing with higher-order allocations, a deadlock scenario was observed in compaction: One GFP_NOFS allocation was sleeping on mm/compaction.c::too_many_isolated(), while all CPUs in the system were busy with compactors spinning on buffer locks held by the sleeping GFP_NOFS allocation. Reclaim is susceptible to this same deadlock; we fixed it by granting GFP_NOFS allocations additional LRU isolation headroom, to ensure it makes forward progress while holding fs locks that other reclaimers might acquire. Do the same here. This code has been like this since compaction was initially merged, and I only managed to trigger this with out-of-tree patches that dramatically increase the contexts that do GFP_NOFS compaction. While the issue is real, it seems theoretical in nature given existing allocation sites. Worth fixing now, but no Fixes tag or stable CC. Link: https://lkml.kernel.org/r/20230519111359.40475-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/compaction.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 9b550bfe900b..261071a07681 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -740,8 +740,9 @@ isolate_freepages_range(struct compact_control *cc, } /* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(pg_data_t *pgdat) +static bool too_many_isolated(struct compact_control *cc) { + pg_data_t *pgdat = cc->zone->zone_pgdat; bool too_many; unsigned long active, inactive, isolated; @@ -753,6 +754,17 @@ static bool too_many_isolated(pg_data_t *pgdat) isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + node_page_state(pgdat, NR_ISOLATED_ANON); + /* + * Allow GFP_NOFS to isolate past the limit set for regular + * compaction runs. This prevents an ABBA deadlock when other + * compactors have already isolated to the limit, but are + * blocked on filesystem locks held by the GFP_NOFS thread. + */ + if (cc->gfp_mask & __GFP_FS) { + inactive >>= 3; + active >>= 3; + } + too_many = isolated > (inactive + active) / 2; if (!too_many) wake_throttle_isolated(pgdat); @@ -801,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * list by either parallel reclaimers or compaction. 
If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(pgdat))) { + while (unlikely(too_many_isolated(cc))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) return -EAGAIN; -- cgit v1.2.3 From 75990f6459b9cf61a94e8a08d0f6a4aa0b8cf3b5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:53:56 +0800 Subject: mm: compaction: drop the redundant page validation in update_pageblock_skip() Patch series "Misc cleanups and improvements for compaction". This series contains some cleanups and improvements for compaction. This patch (of 6): The caller has validated the page before calling update_pageblock_skip(), thus drop the redundant page validation in update_pageblock_skip(). Link: https://lkml.kernel.org/r/5142e15b9295fe8c447dbb39b7907a20177a1413.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 261071a07681..83004c15715a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -436,9 +436,6 @@ static void update_pageblock_skip(struct compact_control *cc, if (cc->no_set_skip_hint) return; - if (!page) - return; - set_pageblock_skip(page); /* Update where async and sync compaction should restart */ -- cgit v1.2.3 From 2dbd90054f965c899b9adb62b2d0d215f687d04b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:53:57 +0800 Subject: mm: compaction: change fast_isolate_freepages() to void type No caller cares about the return value of fast_isolate_freepages(), void it. Link: https://lkml.kernel.org/r/759fca20b22ebf4c81afa30496837b9e0fb2e53b.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 83004c15715a..d4cee5803214 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1413,8 +1413,7 @@ static int next_search_order(struct compact_control *cc, int order) return order; } -static unsigned long -fast_isolate_freepages(struct compact_control *cc) +static void fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); unsigned int nr_scanned = 0; @@ -1427,7 +1426,7 @@ fast_isolate_freepages(struct compact_control *cc) /* Full compaction passes in a negative order */ if (cc->order <= 0) - return cc->free_pfn; + return; /* * If starting the scan, use a deeper search and use the highest @@ -1566,11 +1565,10 @@ fast_isolate_freepages(struct compact_control *cc) cc->total_free_scanned += nr_scanned; if (!page) - return cc->free_pfn; + return; low_pfn = page_to_pfn(page); fast_isolate_around(cc, low_pfn); - return low_pfn; } /* -- cgit v1.2.3 From cf650342f83ae655c6d05a1a74ae1672459973d0 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:53:58 +0800 Subject: mm: compaction: skip more fully scanned pageblock In fast_isolate_around(), it assumes the pageblock is fully scanned if cc->nr_freepages < cc->nr_migratepages after trying to isolate some free pages, and will set skip flag to avoid scanning in future.
However this can miss setting the skip flag for a fully scanned pageblock (returned 'start_pfn' is equal to 'end_pfn') in the case where cc->nr_freepages is larger than cc->nr_migratepages. So using the returned 'start_pfn' from isolate_freepages_block() and 'end_pfn' to decide if a pageblock is fully scanned makes more sense. It can also cover the case where cc->nr_freepages < cc->nr_migratepages, which means the 'start_pfn' is usually equal to 'end_pfn' except some uncommon fatal error occurs after non-strict mode isolation. Link: https://lkml.kernel.org/r/f4efd2fa08735794a6d809da3249b6715ba6ad38.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index d4cee5803214..c5f97bfd629c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1389,7 +1389,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ - if (cc->nr_freepages < cc->nr_migratepages) + if (start_pfn == end_pfn) set_pageblock_skip(page); return; -- cgit v1.2.3 From 8b71b499ff98fdcda7efefc146841a8b4d26813d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:53:59 +0800 Subject: mm: compaction: only set skip flag if cc->no_set_skip_hint is false To keep the same logic as test_and_set_skip(), only set the skip flag if cc->no_set_skip_hint is false, which makes code more reasonable. Link: https://lkml.kernel.org/r/0eb2cd2407ffb259ae6e3071e10f70f2d41d0f3e.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index c5f97bfd629c..3b09d8d02581 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1201,7 +1201,7 @@ isolate_abort: * rescanned twice in a row. */ if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { - if (valid_page && !skip_updated) + if (!cc->no_set_skip_hint && valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); } -- cgit v1.2.3 From 447ba88658faa8dbfd29d557daa38b7d88f460ec Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:54:00 +0800 Subject: mm: compaction: add trace event for fast freepages isolation The fast_isolate_freepages() can also isolate freepages, but we can not know the fast isolation efficiency to understand the fast isolation pressure. So add a trace event to show some numbers to help to understand the efficiency for fast freepages isolation. 
Link: https://lkml.kernel.org/r/78d2932d0160d122c15372aceb3f2c45460a17fc.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 11 +++++++++++ mm/compaction.c | 6 +++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'mm/compaction.c') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 3313eb83c117..2b2a975efd20 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -64,6 +64,17 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepages, + + TP_PROTO( + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) +); + #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, diff --git a/mm/compaction.c b/mm/compaction.c index 3b09d8d02581..ce6293bf9c4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1416,7 +1416,7 @@ static int next_search_order(struct compact_control *cc, int order) static void fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); - unsigned int nr_scanned = 0; + unsigned int nr_scanned = 0, total_isolated = 0; unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; unsigned long distance; @@ -1515,6 +1515,7 @@ static void fast_isolate_freepages(struct compact_control *cc) set_page_private(page, order); nr_isolated = 1 << order; nr_scanned += nr_isolated - 1; + total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1535,6 +1536,9 @@ static void fast_isolate_freepages(struct compact_control *cc) limit = max(1U, limit >> 1); } + trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn, + nr_scanned, total_isolated); + if (!page) { cc->fast_search_fail++; if (scan_start) { -- cgit v1.2.3 From a8d13355c660255266ece529e81e6cb26754941a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 May 2023 20:54:01 +0800 Subject: mm: compaction: skip fast freepages isolation if enough freepages are isolated I've observed that fast isolation often isolates more pages than cc->migratepages, and the excess freepages will be released back to the buddy system. So skip fast freepages isolation if enough freepages are isolated to save some CPU cycles. Link: https://lkml.kernel.org/r/f39c2c07f2dba2732fd9c0843572e5bef96f7f67.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/compaction.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index ce6293bf9c4a..767b0815c874 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1528,6 +1528,10 @@ static void fast_isolate_freepages(struct compact_control *cc) spin_unlock_irqrestore(&cc->zone->lock, flags); + /* Skip fast search if enough freepages isolated */ + if (cc->nr_freepages >= cc->nr_migratepages) + break; + /* * Smaller scan on next order so the total scan is related * to freelist_scan_limit. 
-- cgit v1.2.3 From 833dfc0090b3f8017ddac82d818b2d8e5ceb61db Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Jun 2023 11:46:15 +0800 Subject: mm: compaction: mark kcompactd_run() and kcompactd_stop() __meminit Add __meminit to kcompactd_run() and kcompactd_stop() to ensure they default to __init when memory hotplug is not enabled. Link: https://lkml.kernel.org/r/20230610034615.997813-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Baolin Wang Signed-off-by: Andrew Morton --- include/linux/compaction.h | 4 ++-- mm/compaction.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/compaction.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 57b16e69c19a..e94776496049 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -98,8 +98,8 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); -extern void kcompactd_run(int nid); -extern void kcompactd_stop(int nid); +extern void __meminit kcompactd_run(int nid); +extern void __meminit kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else diff --git a/mm/compaction.c b/mm/compaction.c index 767b0815c874..6149a2d324be 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3050,7 +3050,7 @@ static int kcompactd(void *p) * This kcompactd start function will be called by init and node-hot-add. * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. */ -void kcompactd_run(int nid) +void __meminit kcompactd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3068,7 +3068,7 @@ void kcompactd_run(int nid) * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ -void kcompactd_stop(int nid) +void __meminit kcompactd_stop(int nid) { struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; -- cgit v1.2.3 From 9721fd82351d47a37ba982272e128101f24efd7c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 14 Jun 2023 16:40:20 +0800 Subject: mm: compaction: skip memory hole rapidly when isolating migratable pages On some machines, the normal zone can have a large memory hole like below memory layout, and we can see the range from 0x100000000 to 0x1800000000 is a hole. So when isolating some migratable pages, the scanner can meet the hole and it will take more time to skip the large hole. From my measurement, I can see the isolation scanner will take 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000]. So adding a new helper to fast search next online memory section to skip the large hole can help to find next suitable pageblock efficiently. With this patch, I can see the large hole scanning only takes < 1us.
[ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000040000000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x0000001fa7ffffff] [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000040000000-0x0000000fffffffff] [ 0.000000] node 0: [mem 0x0000001800000000-0x0000001fa3c7ffff] [ 0.000000] node 0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff] [ 0.000000] node 0: [mem 0x0000001fa4000000-0x0000001fa402ffff] [ 0.000000] node 0: [mem 0x0000001fa4030000-0x0000001fa40effff] [ 0.000000] node 0: [mem 0x0000001fa40f0000-0x0000001fa73cffff] [ 0.000000] node 0: [mem 0x0000001fa73d0000-0x0000001fa745ffff] [ 0.000000] node 0: [mem 0x0000001fa7460000-0x0000001fa746ffff] [ 0.000000] node 0: [mem 0x0000001fa7470000-0x0000001fa758ffff] [ 0.000000] node 0: [mem 0x0000001fa7590000-0x0000001fa7ffffff] [baolin.wang@linux.alibaba.com: limit next_ptn to not exceed cc->free_pfn] Link: https://lkml.kernel.org/r/a1d859c28af0c7e85e91795e7473f553eb180a9d.1686813379.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/75b4c8ca36bf44ad8c42bf0685ac19d272e426ec.1686705221.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 6149a2d324be..0fb3b89b3967 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -229,6 +229,33 @@ static void reset_cached_positions(struct zone *zone) pageblock_start_pfn(zone_end_pfn(zone) - 1); } +#ifdef CONFIG_SPARSEMEM +/* + * If the PFN falls into an offline section, return the start PFN of the + * next online section. If the PFN falls into an online section or if + * there is no next online section, return 0. + */ +static unsigned long skip_offline_sections(unsigned long start_pfn) +{ + unsigned long start_nr = pfn_to_section_nr(start_pfn); + + if (online_section_nr(start_nr)) + return 0; + + while (++start_nr <= __highest_present_section_nr) { + if (online_section_nr(start_nr)) + return section_nr_to_pfn(start_nr); + } + + return 0; +} +#else +static unsigned long skip_offline_sections(unsigned long start_pfn) +{ + return 0; +} +#endif + /* * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are @@ -1955,8 +1982,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, cc->zone); - if (!page) + if (!page) { + unsigned long next_pfn; + + next_pfn = skip_offline_sections(block_start_pfn); + if (next_pfn) + block_end_pfn = min(next_pfn, cc->free_pfn); continue; + } /* * If isolation recently failed, do not retry. Only check the -- cgit v1.2.3 From 56ae0bb349b4eeb172674d4876f2b6290d505a25 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 19 Jun 2023 19:07:17 +0800 Subject: mm: compaction: convert to use a folio in isolate_migratepages_block() Directly use a folio instead of page_folio() when page successfully isolated (hugepage and movable page) and after folio_get_nontail_page(), which removes several calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230619110718.65679-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Baolin Wang Cc: James Gowans Cc: Matthew Wilcox Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/compaction.c | 84 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) (limited to 'mm/compaction.c') diff --git a/mm/compaction.c b/mm/compaction.c index 0fb3b89b3967..dbc9f86b1934 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -822,6 +822,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, struct lruvec *lruvec; unsigned long flags = 0; struct lruvec *locked = NULL; + struct folio *folio = NULL; struct page *page = NULL, *valid_page = NULL; struct address_space *mapping; unsigned long start_pfn = low_pfn; @@ -918,7 +919,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!valid_page && pageblock_aligned(low_pfn)) { if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; - page = NULL; + folio = NULL; goto isolate_abort; } valid_page = page; @@ -950,7 +951,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - low_pfn += compound_nr(page) - 1; + folio = page_folio(page); + low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } @@ -1018,8 +1020,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - if (isolate_movable_page(page, mode)) + if (isolate_movable_page(page, mode)) { + folio = page_folio(page); goto isolate_success; + } } goto isolate_fail; @@ -1030,7 +1034,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * sure the page is not being freed elsewhere -- the * page release code relies on it. */ - if (unlikely(!get_page_unless_zero(page))) + folio = folio_get_nontail_page(page); + if (unlikely(!folio)) goto isolate_fail; /* @@ -1038,8 +1043,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * so avoid taking lru_lock and isolating it unnecessarily in an * admittedly racy check. */ - mapping = page_mapping(page); - if (!mapping && (page_count(page) - 1) > total_mapcount(page)) + mapping = folio_mapping(folio); + if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio)) goto isolate_fail_put; /* @@ -1050,11 +1055,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail_put; /* Only take pages on LRU: a check now makes later tests safe */ - if (!PageLRU(page)) + if (!folio_test_lru(folio)) goto isolate_fail_put; /* Compaction might skip unevictable pages but CMA takes them */ - if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page)) + if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio)) goto isolate_fail_put; /* @@ -1063,10 +1068,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * it will be able to migrate without blocking - clean pages * for the most part. PageWriteback would require blocking. 
*/ - if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page)) + if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio)) goto isolate_fail_put; - if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) { + if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) { bool migrate_dirty; /* @@ -1078,22 +1083,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * the page lock until after the page is removed * from the page cache. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto isolate_fail_put; - mapping = page_mapping(page); + mapping = folio_mapping(folio); migrate_dirty = !mapping || mapping->a_ops->migrate_folio; - unlock_page(page); + folio_unlock(folio); if (!migrate_dirty) goto isolate_fail_put; } - /* Try isolate the page */ - if (!TestClearPageLRU(page)) + /* Try isolate the folio */ + if (!folio_test_clear_lru(folio)) goto isolate_fail_put; - lruvec = folio_lruvec(page_folio(page)); + lruvec = folio_lruvec(folio); /* If we already hold the lock, we can skip some rechecking */ if (lruvec != locked) { @@ -1103,7 +1108,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; - lruvec_memcg_debug(lruvec, page_folio(page)); + lruvec_memcg_debug(lruvec, folio); /* * Try get exclusive access under lock. If marked for @@ -1119,34 +1124,33 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Page become compound since the non-locked check, - * and it's on LRU. It can only be a THP so the order - * is safe to read and it's 0 for tail pages. + * folio become large since the non-locked check, + * and it's on LRU. */ - if (unlikely(PageCompound(page) && !cc->alloc_contig)) { - low_pfn += compound_nr(page) - 1; - nr_scanned += compound_nr(page) - 1; - SetPageLRU(page); + if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) { + low_pfn += folio_nr_pages(folio) - 1; + nr_scanned += folio_nr_pages(folio) - 1; + folio_set_lru(folio); goto isolate_fail_put; } } - /* The whole page is taken off the LRU; skip the tail pages. 
*/ - if (PageCompound(page)) - low_pfn += compound_nr(page) - 1; + /* The folio is taken off the LRU */ + if (folio_test_large(folio)) + low_pfn += folio_nr_pages(folio) - 1; /* Successfully isolated */ - del_page_from_lru_list(page, lruvec); - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - thp_nr_pages(page)); + lruvec_del_folio(lruvec, folio); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); isolate_success: - list_add(&page->lru, &cc->migratepages); + list_add(&folio->lru, &cc->migratepages); isolate_success_no_list: - cc->nr_migratepages += compound_nr(page); - nr_isolated += compound_nr(page); - nr_scanned += compound_nr(page) - 1; + cc->nr_migratepages += folio_nr_pages(folio); + nr_isolated += folio_nr_pages(folio); + nr_scanned += folio_nr_pages(folio) - 1; /* * Avoid isolating too much unless this block is being @@ -1168,7 +1172,7 @@ isolate_fail_put: unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; } - put_page(page); + folio_put(folio); isolate_fail: if (!skip_on_failure && ret != -ENOMEM) @@ -1209,14 +1213,14 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; - page = NULL; + folio = NULL; isolate_abort: if (locked) unlock_page_lruvec_irqrestore(locked, flags); - if (page) { - SetPageLRU(page); - put_page(page); + if (folio) { + folio_set_lru(folio); + folio_put(folio); } /* -- cgit v1.2.3
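
A minimal sketch of the page-to-folio pattern used in the conversion above (illustrative only, not code from any of these patches; the helper name try_grab_lru_folio() is hypothetical): the reference is taken once with folio_get_nontail_page(), and every later test uses folio_* helpers, so compound_head() is not recomputed for each flag or size check.

#include <linux/mm.h>

/*
 * Illustrative sketch mirroring the isolation path above: take a
 * reference on a candidate page as a folio. This fails when the
 * reference cannot be taken (e.g. a tail page or a page racing with
 * a concurrent free), the cases the old get_page_unless_zero() call
 * had to cover.
 */
static struct folio *try_grab_lru_folio(struct page *page)
{
	struct folio *folio = folio_get_nontail_page(page);

	if (!folio)
		return NULL;

	/* No further compound_head() lookups are needed from here on. */
	if (!folio_test_lru(folio)) {
		folio_put(folio);
		return NULL;
	}

	return folio;
}

A caller would then account folio_nr_pages(folio) pages in one step and add &folio->lru to its private list, as the final hunks of the patch above do with cc->migratepages.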