summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c446
1 files changed, 282 insertions, 164 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c767d71c43d7..f8dfd2864bbf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -323,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+ struct mem_cgroup *memcg)
{
+ int demotion_nid;
+
if (!numa_demotion_enabled)
return false;
if (sc && sc->no_demotion)
return false;
- if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+ demotion_nid = next_demotion_node(nid);
+ if (demotion_nid == NUMA_NO_NODE)
return false;
- return true;
+ /* If demotion node isn't in the cgroup's mems_allowed, fall back */
+ return mem_cgroup_node_allowed(memcg, demotion_nid);
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -357,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
*
* Can it be reclaimed from this node via demotion?
*/
- return can_demote(nid, sc);
+ return can_demote(nid, sc, memcg);
}
/*
@@ -396,13 +421,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
unsigned long size = 0;
int zid;
+ struct zone *zone;
- for (zid = 0; zid <= zone_idx; zid++) {
- struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
@@ -441,21 +462,26 @@ void drop_slab(void)
} while ((freed >> shift++) > 1);
}
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type) \
+ do { \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGSCAN_##type - PGSCAN_KSWAPD); \
+ } while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
{
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+ CHECK_RECLAIMER_OFFSET(DIRECT);
+ CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+ CHECK_RECLAIMER_OFFSET(PROACTIVE);
if (current_is_kswapd())
return 0;
if (current_is_khugepaged())
return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ if (sc->proactive)
+ return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
@@ -495,7 +521,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
-
+ struct zone *zone;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
@@ -508,12 +534,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
@@ -633,21 +654,20 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int (*writeout)(struct folio *, struct writeback_control *);
+
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -670,7 +690,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
+ if (shmem_mapping(mapping))
+ writeout = shmem_writeout;
+ else if (folio_test_anon(folio))
+ writeout = swap_writeout;
+ else
return PAGE_ACTIVATE;
if (folio_clear_dirty_for_io(folio)) {
@@ -693,7 +717,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
wbc.list = folio_list;
folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
+ res = writeout(folio, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -702,7 +726,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
+ /* synchronous write? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
@@ -769,7 +793,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
- mem_cgroup_swapout(folio, swap);
+ memcg1_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -1081,7 +1105,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
*/
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
- struct reclaim_stat *stat, bool ignore_references)
+ struct reclaim_stat *stat, bool ignore_references,
+ struct mem_cgroup *memcg)
{
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
@@ -1094,7 +1119,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
cond_resched();
- do_demote_pass = can_demote(pgdat->node_id, sc);
+ do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
retry:
while (!list_empty(folio_list)) {
@@ -1112,6 +1137,13 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1165,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1191,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1201,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have
@@ -1279,7 +1317,7 @@ retry:
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1295,9 +1333,21 @@ retry:
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (!add_to_swap(folio))
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
goto activate_locked_split;
}
+ /*
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and SwapBacked flag has no lock protected. For such folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
}
@@ -1624,7 +1674,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
- &stat, true);
+ &stat, true, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_folios, folio_list);
@@ -1691,13 +1741,11 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0;
- unsigned long scan, total_scan, nr_pages;
+ unsigned long skipped = 0, total_scan = 0, scan = 0;
+ unsigned long nr_pages;
unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
- total_scan = 0;
- scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
struct folio *folio;
@@ -1986,10 +2034,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -1997,18 +2045,19 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+ lruvec_memcg(lruvec));
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
- __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2098,7 +2147,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
- __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2155,7 +2204,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+ count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
@@ -2180,7 +2229,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
while (!list_empty(folio_list)) {
folio = lru_to_folio(folio_list);
list_del(&folio->lru);
@@ -2372,17 +2421,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
+ struct zone *zone;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
total_high_wmark += high_wmark_pages(zone);
}
@@ -2400,6 +2445,43 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
}
}
+static inline void calculate_pressure_balance(struct scan_control *sc,
+ int swappiness, u64 *fraction, u64 *denominator)
+{
+ unsigned long anon_cost, file_cost, total_cost;
+ unsigned long ap, fp;
+
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[WORKINGSET_ANON] = ap;
+ fraction[WORKINGSET_FILE] = fp;
+ *denominator = ap + fp;
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2412,12 +2494,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- unsigned long anon_cost, file_cost, total_cost;
int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
- unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */
@@ -2438,6 +2518,13 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
goto out;
}
+ /* Proactive reclaim initiated by userspace for anonymous memory only */
+ if (swappiness == SWAPPINESS_ANON_ONLY) {
+ WARN_ON_ONCE(!sc->proactive);
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
@@ -2458,7 +2545,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
/*
* If there is enough inactive page cache, we do not reclaim
- * anything from the anonymous working right now.
+ * anything from the anonymous working right now to make sure
+ * a streaming file access pattern doesn't cause swapping.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
@@ -2466,35 +2554,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
- /*
- * Calculate the pressure balance between anon and file pages.
- *
- * The amount of pressure we put on each LRU is inversely
- * proportional to the cost of reclaiming each list, as
- * determined by the share of pages that are refaulting, times
- * the relative IO cost of bringing back a swapped out
- * anonymous page vs reloading a filesystem page (swappiness).
- *
- * Although we limit that influence to ensure no list gets
- * left behind completely: at least a third of the pressure is
- * applied, before swappiness.
- *
- * With swappiness at 100, anon and file have equal IO cost.
- */
- total_cost = sc->anon_cost + sc->file_cost;
- anon_cost = total_cost + sc->anon_cost;
- file_cost = total_cost + sc->file_cost;
- total_cost = anon_cost + file_cost;
-
- ap = swappiness * (total_cost + 1);
- ap /= anon_cost + 1;
-
- fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
- fp /= file_cost + 1;
+ calculate_pressure_balance(sc, swappiness, fraction, &denominator);
- fraction[0] = ap;
- fraction[1] = fp;
- denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
@@ -2608,7 +2669,7 @@ out:
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
@@ -2616,7 +2677,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return true;
/* Also valuable if anon pages can be demoted: */
- return can_demote(pgdat->node_id, sc);
+ return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+ lruvec_memcg(lruvec));
}
#ifdef CONFIG_LRU_GEN
@@ -2652,8 +2714,12 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+/* Get the min/max evictable type based on swappiness */
+#define min_type(swappiness) (!(swappiness))
+#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)
+
#define evictable_min_seq(min_seq, swappiness) \
- min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
+ min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
@@ -2661,7 +2727,7 @@ static bool should_clear_pmd_young(void)
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
#define for_each_evictable_type(type, swappiness) \
- for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
+ for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2694,7 +2760,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
if (!sc->may_swap)
return 0;
- if (!can_demote(pgdat->node_id, sc) &&
+ if (!can_demote(pgdat->node_id, sc, memcg) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3812,7 +3878,12 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
+ /* For file type, skip the check if swappiness is anon only */
+ if (type && (swappiness == SWAPPINESS_ANON_ONLY))
+ goto done;
+
+ /* For anon type, skip the check if swappiness is zero (file only) */
+ if (!type && !swappiness)
goto done;
/* prevent cold/hot inversion if the type is evictable */
@@ -4545,13 +4616,13 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
- __count_memcg_events(memcg, item, isolated);
- __count_memcg_events(memcg, PGREFILL, sorted);
+ count_memcg_events(memcg, item, isolated);
+ count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
scanned, skipped, isolated,
@@ -4657,7 +4728,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
retry:
- reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr_reclaimed += reclaimed;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -4695,13 +4766,13 @@ retry:
reset_batch_size(walk);
}
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
- __count_memcg_events(memcg, item, reclaimed);
+ count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
@@ -5478,7 +5549,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS + 1)
+ else if (swappiness > SWAPPINESS_ANON_ONLY)
goto done;
switch (cmd) {
@@ -5535,24 +5606,35 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
while ((cur = strsep(&next, ",;\n"))) {
int n;
int end;
- char cmd;
+ char cmd, swap_string[5];
unsigned int memcg_id;
unsigned int nid;
unsigned long seq;
- unsigned int swappiness = -1;
+ unsigned int swappiness;
unsigned long opt = -1;
cur = skip_spaces(cur);
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
- &seq, &end, &swappiness, &end, &opt, &end);
+ n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
break;
}
+ if (n == 4) {
+ swappiness = -1;
+ } else if (!strcmp("max", swap_string)) {
+ /* set by userspace for anonymous memory only */
+ swappiness = SWAPPINESS_ANON_ONLY;
+ } else {
+ err = kstrtouint(swap_string, 0, &swappiness);
+ if (err)
+ break;
+ }
+
err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
if (err)
break;
@@ -5812,7 +5894,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ if (can_age_anon_pages(lruvec, sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -5843,6 +5925,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
+ struct zone *zone;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -5862,17 +5945,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
- for (z = 0; z <= sc->reclaim_idx; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+ unsigned long watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, watermark,
+ sc->reclaim_idx))
return false;
}
@@ -6099,22 +6181,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
sc->reclaim_idx, 0))
return true;
- /* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
- return false;
-
/*
- * Compaction is already possible, but it takes time to run and there
- * are potentially other callers using the pages just freed. So proceed
- * with reclaim to make a buffer of free pages available to give
- * compaction a reasonable chance of completing and allocating the page.
+ * Direct reclaim usually targets the min watermark, but compaction
+ * takes time to run and there are potentially other callers using the
+ * pages just freed. So target a higher buffer to give compaction a
+ * reasonable chance of completing and allocating the pages.
+ *
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
- watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+ watermark = high_wmark_pages(zone);
+ if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+ return true;
- return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+ return false;
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
@@ -6393,11 +6474,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= ZONE_NORMAL; i++) {
- zone = &pgdat->node_zones[i];
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
if (!zone_reclaimable_pages(zone))
continue;
@@ -6648,10 +6725,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
return;
}
- if (!can_age_anon_pages(pgdat, sc))
+ lruvec = mem_cgroup_lruvec(NULL, pgdat);
+ if (!can_age_anon_pages(lruvec, sc))
return;
- lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
@@ -6702,17 +6779,48 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
+ unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+ /*
+ * In defrag_mode, watermarks must be met in whole
+ * blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
+ */
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
+ else
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
+
+ if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+ 0, free_pages))
return true;
}
@@ -6792,11 +6900,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
- for (z = 0; z <= sc->reclaim_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
@@ -6827,12 +6931,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
int i;
struct zone *zone;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
@@ -6893,11 +6992,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
@@ -7404,6 +7499,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = {
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+#endif
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7411,6 +7528,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
@@ -7576,11 +7694,11 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);