summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c976
1 files changed, 557 insertions, 419 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76378bc257e3..f8dfd2864bbf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -323,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -357,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -374,7 +399,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
+ /*
+ * If there are no reclaimable file-backed or anonymous pages,
+ * ensure zones with sufficient free pages are not skipped.
+ * This prevents zones like DMA32 from being ignored in reclaim
+ * scenarios where they can still help alleviate memory pressure.
+ */
+ if (nr == 0)
+ nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
return nr;
}
@@ -389,13 +421,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
unsigned long size = 0;
int zid;
+ struct zone *zone;
- for (zid = 0; zid <= zone_idx; zid++) {
- struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
@@ -434,21 +462,26 @@ void drop_slab(void)
} while ((freed >> shift++) > 1);
}
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type) \
+ do { \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGSCAN_##type - PGSCAN_KSWAPD); \
+ } while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
{
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+ CHECK_RECLAIMER_OFFSET(DIRECT);
+ CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+ CHECK_RECLAIMER_OFFSET(PROACTIVE);
if (current_is_kswapd())
return 0;
if (current_is_khugepaged())
return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ if (sc->proactive)
+ return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
@@ -488,7 +521,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
-
+ struct zone *zone;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
@@ -501,12 +534,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
@@ -626,21 +654,20 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int (*writeout)(struct folio *, struct writeback_control *);
+
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -663,7 +690,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
+ if (shmem_mapping(mapping))
+ writeout = shmem_writeout;
+ else if (folio_test_anon(folio))
+ writeout = swap_writeout;
+ else
return PAGE_ACTIVATE;
if (folio_clear_dirty_for_io(folio)) {
@@ -686,7 +717,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
wbc.list = folio_list;
folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
+ res = writeout(folio, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -695,7 +726,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
+ /* synchronous write? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
@@ -762,7 +793,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
- mem_cgroup_swapout(folio, swap);
+ memcg1_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -855,6 +886,31 @@ enum folio_references {
FOLIOREF_ACTIVATE,
};
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return false;
+ }
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
@@ -863,7 +919,6 @@ static enum folio_references folio_check_references(struct folio *folio,
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
- referenced_folio = folio_test_clear_referenced(folio);
/*
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -881,6 +936,15 @@ static enum folio_references folio_check_references(struct folio *folio,
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
+ if (lru_gen_enabled()) {
+ if (!referenced_ptes)
+ return FOLIOREF_RECLAIM;
+
+ return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+ }
+
+ referenced_folio = folio_test_clear_referenced(folio);
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -1041,12 +1105,13 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
@@ -1054,7 +1119,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1072,6 +1137,13 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1085,11 +1157,6 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- /* folio_update_gen() tried to promote this page? */
- if (lru_gen_enabled() && !ignore_references &&
- folio_mapped(folio) && folio_test_referenced(folio))
- goto keep_locked;
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1130,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1156,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1166,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1244,7 +1317,7 @@ retry:
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1260,9 +1333,21 @@ retry:
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (!add_to_swap(folio))
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
goto activate_locked_split;
}
+ /*
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and SwapBacked flag has no lock protected. For such folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
}
@@ -1515,8 +1600,9 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
- nr_reclaimed += stat->nr_demoted;
+ nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += nr_demoted;
+ stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -1588,7 +1674,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1655,12 +1741,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
+ unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -1671,9 +1756,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+ /* Using max_nr_skipped to prevent hard LOCKUP*/
+ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
+ (folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
+ max_nr_skipped++;
goto move;
}
@@ -1946,10 +2034,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -1957,18 +2045,19 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2058,7 +2147,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2115,7 +2204,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2140,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2332,17 +2421,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
+ struct zone *zone;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
total_high_wmark += high_wmark_pages(zone);
}
@@ -2360,6 +2445,43 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
}
}
+static inline void calculate_pressure_balance(struct scan_control *sc,
+ int swappiness, u64 *fraction, u64 *denominator)
+{
+ unsigned long anon_cost, file_cost, total_cost;
+ unsigned long ap, fp;
+
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[WORKINGSET_ANON] = ap;
+ fraction[WORKINGSET_FILE] = fp;
+ *denominator = ap + fp;
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2372,12 +2494,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- unsigned long anon_cost, file_cost, total_cost;
int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
- unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */
@@ -2398,6 +2518,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2418,7 +2545,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
+ * anything from the anonymous working right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2426,35 +2554,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
- /*
- * Calculate the pressure balance between anon and file pages.
- *
- * The amount of pressure we put on each LRU is inversely
- * proportional to the cost of reclaiming each list, as
- * determined by the share of pages that are refaulting, times
- * the relative IO cost of bringing back a swapped out
- * anonymous page vs reloading a filesystem page (swappiness).
- *
- * Although we limit that influence to ensure no list gets
- * left behind completely: at least a third of the pressure is
- * applied, before swappiness.
- *
- * With swappiness at 100, anon and file have equal IO cost.
- */
- total_cost = sc->anon_cost + sc->file_cost;
- anon_cost = total_cost + sc->anon_cost;
- file_cost = total_cost + sc->file_cost;
- total_cost = anon_cost + file_cost;
-
- ap = swappiness * (total_cost + 1);
- ap /= anon_cost + 1;
-
- fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
- fp /= file_cost + 1;
+ calculate_pressure_balance(sc, swappiness, fraction, &denominator);
- fraction[0] = ap;
- fraction[1] = fp;
- denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
@@ -2568,7 +2669,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2576,7 +2677,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2612,11 +2714,21 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
+#define evictable_min_seq(min_seq, swappiness) \
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
+
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define for_each_evictable_type(type, swappiness) \
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
+
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2648,7 +2760,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -2662,10 +2774,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_folio */
- return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ int n = get_nr_gens(lruvec, type);
+
+ if (n < MIN_NR_GENS || n > MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
}
/******************************************************************************
@@ -3066,16 +3184,20 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
+ int i;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- pos->refaulted = lrugen->avg_refaulted[type][tier] +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- pos->total = lrugen->avg_total[type][tier] +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
+ pos->refaulted = pos->total = 0;
+
+ for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ pos->refaulted += lrugen->avg_refaulted[type][i] +
+ atomic_long_read(&lrugen->refaulted[hist][type][i]);
+ pos->total += lrugen->avg_total[type][i] +
+ lrugen->protected[hist][type][i] +
+ atomic_long_read(&lrugen->evicted[hist][type][i]);
+ }
}
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
@@ -3101,17 +3223,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
+ lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- if (tier)
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
}
}
}
@@ -3138,16 +3258,19 @@ static int folio_update_gen(struct folio *folio, int gen)
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return -1;
+ }
+
do {
/* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
+ if (!(old_flags & LRU_GEN_MASK))
+ return -1;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+ new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
@@ -3171,7 +3294,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_gen = (old_gen + 1) % MAX_NR_GENS;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
/* for folio_end_writeback() */
if (reclaiming)
@@ -3246,7 +3369,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (vma_is_anonymous(vma))
- return !walk->can_swap;
+ return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
return true;
@@ -3256,7 +3379,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (shmem_mapping(mapping))
- return !walk->can_swap;
+ return !walk->swappiness;
+
+ if (walk->swappiness > MAX_SWAPPINESS)
+ return true;
/* to exclude special mappings like dax, etc. */
return !mapping->a_ops->read_folio;
@@ -3344,19 +3470,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
}
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
+ struct pglist_data *pgdat)
{
- struct folio *folio;
+ struct folio *folio = pfn_folio(pfn);
- folio = pfn_folio(pfn);
- if (folio_nid(folio) != pgdat->node_id)
+ if (folio_lru_gen(folio) < 0)
return NULL;
- if (folio_memcg(folio) != memcg)
+ if (folio_nid(folio) != pgdat->node_id)
return NULL;
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
+ if (folio_memcg(folio) != memcg)
return NULL;
return folio;
@@ -3370,29 +3494,55 @@ static bool suitable_to_scan(int total, int young)
return young * n >= total;
}
+static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int new_gen, bool dirty)
+{
+ int old_gen;
+
+ if (!folio)
+ return;
+
+ if (dirty && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ } else if (lru_gen_set_refs(folio)) {
+ old_gen = folio_lru_gen(folio);
+ if (old_gen >= 0 && old_gen != new_gen)
+ folio_activate(folio);
+ }
+}
+
static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
+ bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
int total = 0;
int young = 0;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
pmd_t pmdval;
- pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
- &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
if (!pte)
return false;
+
if (!spin_trylock(ptl)) {
pte_unmap(pte);
- return false;
+ return true;
}
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
@@ -3414,26 +3564,30 @@ restart:
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
continue;
- young++;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
+
+ last = folio;
+ dirty = false;
+ }
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (pte_dirty(ptent))
+ dirty = true;
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
}
+ walk_update_folio(walk, last, gen, dirty);
+ last = NULL;
+
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
@@ -3447,13 +3601,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
+ bool dirty;
pmd_t *pmd;
spinlock_t *ptl;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3499,27 +3655,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (pfn == -1)
goto next;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
goto next;
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pmd_dirty(pmd[i]))
+ dirty = true;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
@@ -3711,22 +3870,30 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{
int zone;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type == LRU_GEN_ANON && !can_swap)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
- /* prevent cold/hot inversion if force_scan is true */
+ /* prevent cold/hot inversion if the type is evictable */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3736,6 +3903,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
new_gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int tier = lru_tier_from_refs(refs, workingset);
+ int delta = folio_nr_pages(folio);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
+
if (!--remaining)
return false;
}
@@ -3747,7 +3923,7 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
@@ -3757,7 +3933,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
/* find the oldest populated generation */
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
@@ -3773,13 +3949,17 @@ next:
}
/* see the comment on lru_gen_folio */
- if (can_swap) {
- min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ if (swappiness && swappiness <= MAX_SWAPPINESS) {
+ unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
+
+ if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
+ min_seq[LRU_GEN_ANON] = seq;
+ else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
+ min_seq[LRU_GEN_FILE] = seq;
}
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- if (min_seq[type] == lrugen->min_seq[type])
+ for_each_evictable_type(type, swappiness) {
+ if (min_seq[type] <= lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
@@ -3790,8 +3970,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
{
bool success;
int prev, next;
@@ -3809,13 +3988,11 @@ restart:
if (!success)
goto unlock;
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
-
- if (inc_min_seq(lruvec, type, can_swap))
+ if (inc_min_seq(lruvec, type, swappiness))
continue;
spin_unlock_irq(&lruvec->lru_lock);
@@ -3859,7 +4036,7 @@ unlock:
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3870,7 +4047,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */
if (seq <= READ_ONCE(mm_state->seq))
@@ -3895,7 +4072,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
walk->lruvec = lruvec;
walk->seq = seq;
- walk->can_swap = can_swap;
+ walk->swappiness = swappiness;
walk->force_scan = force_scan;
do {
@@ -3905,7 +4082,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
@@ -3946,13 +4123,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
unsigned long total = 0;
- bool can_swap = get_swappiness(lruvec, sc);
+ int swappiness = get_swappiness(lruvec, sc);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
@@ -3972,6 +4149,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
{
int gen;
unsigned long birth;
+ int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
@@ -3981,8 +4159,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
if (!lruvec_is_sizable(lruvec, sc))
return false;
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
return time_is_before_jiffies(birth + min_ttl);
@@ -4041,21 +4218,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
+ bool dirty;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
+ struct folio *last = NULL;
int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4102,37 +4280,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(vma, addr, pte + i))
continue;
- young++;
-
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
-
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- continue;
+ last = folio;
+ dirty = false;
}
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen) {
- folio_clear_lru_refs(folio);
- folio_activate(folio);
- }
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
}
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
/* feedback from rmap walkers to page table walkers */
@@ -4290,7 +4459,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4312,14 +4482,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- lrugen->protected[hist][type][tier - 1] + delta);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
return true;
}
@@ -4339,8 +4512,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* waiting for writeback */
- if (folio_test_locked(folio) || writeback ||
- (type == LRU_GEN_FILE && dirty)) {
+ if (writeback || (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4369,13 +4541,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return false;
}
- /* see the comment on MAX_NR_TIERS */
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- folio_clear_lru_refs(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
- folio_clear_referenced(folio);
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4445,13 +4616,13 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4471,13 +4642,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
struct ctrl_pos sp, pv;
/*
- * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * To leave a margin for fluctuations, use a larger gain factor (2:3).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
- read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ read_ctrl_pos(lruvec, type, 0, 2, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ read_ctrl_pos(lruvec, type, tier, 3, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
@@ -4485,79 +4656,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
return tier - 1;
}
-static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
+ if (swappiness <= MIN_SWAPPINESS + 1)
+ return LRU_GEN_FILE;
+
+ if (swappiness >= MAX_SWAPPINESS)
+ return LRU_GEN_ANON;
/*
- * Compare the first tier of anon with that of file to determine which
- * type to scan. Also need to compare other tiers of the selected type
- * with the first tier of the other type to determine the last tier (of
- * the selected type) to evict.
+ * Compare the sum of all tiers of anon with that of file to determine
+ * which type to scan.
*/
- read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- type = positive_ctrl_err(&sp, &pv);
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
- read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- if (!positive_ctrl_err(&sp, &pv))
- break;
- }
-
- *tier_idx = tier - 1;
-
- return type;
+ return positive_ctrl_err(&sp, &pv);
}
static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
- int type;
- int scanned;
- int tier = -1;
- DEFINE_MIN_SEQ(lruvec);
+ int type = get_type_to_scan(lruvec, swappiness);
- /*
- * Try to make the obvious choice first, and if anon and file are both
- * available from the same generation,
- * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
- * first.
- * 2. If !__GFP_IO, file first since clean pagecache is more likely to
- * exist than clean swapcache.
- */
- if (!swappiness)
- type = LRU_GEN_FILE;
- else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- type = LRU_GEN_ANON;
- else if (swappiness == 1)
- type = LRU_GEN_FILE;
- else if (swappiness == MAX_SWAPPINESS)
- type = LRU_GEN_ANON;
- else if (!(sc->gfp_mask & __GFP_IO))
- type = LRU_GEN_FILE;
- else
- type = get_type_to_scan(lruvec, swappiness, &tier);
+ for_each_evictable_type(i, swappiness) {
+ int scanned;
+ int tier = get_tier_idx(lruvec, type);
- for (i = !swappiness; i < ANON_AND_FILE; i++) {
- if (tier < 0)
- tier = get_tier_idx(lruvec, type);
+ *type_scanned = type;
scanned = scan_folios(lruvec, sc, type, tier, list);
if (scanned)
- break;
+ return scanned;
type = !type;
- tier = -1;
}
- *type_scanned = type;
-
- return scanned;
+ return 0;
}
static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
@@ -4573,6 +4710,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
bool skip_retry = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -4582,7 +4720,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
scanned += try_to_inc_min_seq(lruvec, swappiness);
- if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
@@ -4590,7 +4728,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4598,31 +4736,24 @@ retry:
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ DEFINE_MIN_SEQ(lruvec);
+
if (!folio_evictable(folio)) {
list_del(&folio->lru);
folio_putback_lru(folio);
continue;
}
- if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
- continue;
- }
-
- if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
- folio_mapped(folio) || folio_test_locked(folio) ||
- folio_test_dirty(folio) || folio_test_writeback(folio)) {
- /* don't add rejected folios to the oldest generation */
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
- BIT(PG_active));
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, &clean);
continue;
}
- /* retry folios that may have missed folio_rotate_reclaimable() */
- list_move(&folio->lru, &clean);
+ /* don't add rejected folios to the oldest generation */
+ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4635,10 +4766,13 @@ retry:
reset_batch_size(walk);
}
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+ stat.nr_demoted);
+
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4654,63 +4788,32 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- bool can_swap, unsigned long *nr_to_scan)
+ int swappiness, unsigned long *nr_to_scan)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
+ unsigned long size = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- /* whether this lruvec is completely out of cold folios */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
+ *nr_to_scan = 0;
+ /* have to run aging, since eviction is not possible anymore */
+ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- }
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
}
}
- *nr_to_scan = total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ *nr_to_scan = size;
+ /* better to run aging even though eviction is still possible */
+ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
/*
@@ -4718,7 +4821,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
bool success;
unsigned long nr_to_scan;
@@ -4728,7 +4831,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
+ success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
/* try to scrape all its memory if this memcg was deleted */
if (nr_to_scan && !mem_cgroup_online(memcg))
@@ -4739,7 +4842,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return nr_to_scan >> sc->priority;
/* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -5283,8 +5386,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
}
for (i = 0; i < 3; i++)
@@ -5339,7 +5441,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, " node %5d\n", nid);
if (!full)
- seq = min_seq[LRU_GEN_ANON];
+ seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
else if (max_seq >= MAX_NR_GENS)
seq = max_seq - MAX_NR_GENS + 1;
else
@@ -5379,23 +5481,14 @@ static const struct seq_operations lru_gen_seq_ops = {
};
static int run_aging(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- if (seq < max_seq)
- return 0;
if (seq > max_seq)
return -EINVAL;
- if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
- return -ERANGE;
-
- try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
-
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
}
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
@@ -5411,7 +5504,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
while (!signal_pending(current)) {
DEFINE_MIN_SEQ(lruvec);
- if (seq < min_seq[!swappiness])
+ if (seq < evictable_min_seq(min_seq, swappiness))
return 0;
if (sc->nr_reclaimed >= nr_to_reclaim)
@@ -5456,7 +5549,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5513,24 +5606,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
@@ -5790,7 +5894,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -5821,6 +5925,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
+ struct zone *zone;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -5840,17 +5945,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
- for (z = 0; z <= sc->reclaim_idx; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+ unsigned long watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, watermark,
+ sc->reclaim_idx))
return false;
}
@@ -6077,22 +6181,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
sc->reclaim_idx, 0))
return true;
- /* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
- return false;
-
/*
- * Compaction is already possible, but it takes time to run and there
- * are potentially other callers using the pages just freed. So proceed
- * with reclaim to make a buffer of free pages available to give
- * compaction a reasonable chance of completing and allocating the page.
+ * Direct reclaim usually targets the min watermark, but compaction
+ * takes time to run and there are potentially other callers using the
+ * pages just freed. So target a higher buffer to give compaction a
+ * reasonable chance of completing and allocating the pages.
+ *
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
- watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+ watermark = high_wmark_pages(zone);
+ if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+ return true;
- return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+ return false;
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
@@ -6371,11 +6474,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= ZONE_NORMAL; i++) {
- zone = &pgdat->node_zones[i];
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
if (!zone_reclaimable_pages(zone))
continue;
@@ -6626,10 +6725,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6680,17 +6779,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
+ unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+ /*
+ * In defrag_mode, watermarks must be met in whole
+ * blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
+ */
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
+ else
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
+
+ if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+ 0, free_pages))
return true;
}
@@ -6770,11 +6900,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
- for (z = 0; z <= sc->reclaim_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
@@ -6805,12 +6931,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
int i;
struct zone *zone;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
@@ -6871,11 +6992,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
@@ -7182,10 +7299,6 @@ static int kswapd(void *p)
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
@@ -7354,13 +7467,15 @@ void __meminit kswapd_run(int nid)
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
pr_err("Failed to start kswapd on node %d,ret=%ld\n",
nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
+ } else {
+ wake_up_process(pgdat->kswapd);
}
}
pgdat_kswapd_unlock(pgdat);
@@ -7384,6 +7499,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = {
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+#endif
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7391,6 +7528,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
@@ -7556,11 +7694,11 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);