diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 297 |
1 files changed, 190 insertions, 107 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index a6c5d0b28321..ee4eecc7e1c2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -171,11 +171,22 @@ int vm_swappiness = 60; */ unsigned long vm_total_pages; +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +{ + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); + + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); + + task->reclaim_state = rs; +} + static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_MEMCG_KMEM - +#ifdef CONFIG_MEMCG /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); up_write(&shrinker_rwsem); } -#else /* CONFIG_MEMCG_KMEM */ -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return 0; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} -#endif /* CONFIG_MEMCG_KMEM */ - -static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) -{ - /* Check for an overwrite */ - WARN_ON_ONCE(rs && task->reclaim_state); - - /* Check for the nulling of an already-nulled member */ - WARN_ON_ONCE(!rs && !task->reclaim_state); - - task->reclaim_state = rs; -} -#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; @@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat, } #else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + static bool global_reclaim(struct scan_control *sc) { return true; @@ -354,12 +351,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone) */ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { - unsigned long lru_size; + unsigned long lru_size = 0; int zid; - if (!mem_cgroup_disabled()) - lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); - else + if (!mem_cgroup_disabled()) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) + lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + } else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { @@ -591,7 +589,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { @@ -599,7 +597,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, unsigned long ret, freed = 0; int i; - if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + if (!mem_cgroup_online(memcg)) return 0; if (!down_read_trylock(&shrinker_rwsem)) @@ -625,6 +623,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; } + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_enabled() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(i, map->map); @@ -661,13 +664,13 @@ unlock: up_read(&shrinker_rwsem); return freed; } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches @@ -930,10 +933,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under the i_pages lock, then this ordering is not required. */ - if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) - refcount = 1 + HPAGE_PMD_NR; - else - refcount = 2; + refcount = 1 + compound_nr(page); if (!page_ref_freeze(page, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ @@ -1121,7 +1121,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct scan_control *sc, enum ttu_flags ttu_flags, struct reclaim_stat *stat, - bool force_reclaim) + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -1135,7 +1135,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct address_space *mapping; struct page *page; int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; + enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; @@ -1149,7 +1149,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON_PAGE(PageActive(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); /* Account the number of base pages even though THP */ sc->nr_scanned += nr_pages; @@ -1266,7 +1266,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!force_reclaim) + if (!ignore_references) references = page_check_references(page, sc); switch (references) { @@ -1487,10 +1487,9 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) { - mem_cgroup_uncharge(page); + if (unlikely(PageTransHuge(page))) (*get_compound_page_dtor(page))(page); - } else + else list_add(&page->lru, &free_pages); continue; @@ -1705,7 +1704,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); total_scan += nr_pages; if (page_zonenum(page) > sc->reclaim_idx) { @@ -1911,7 +1910,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&pgdat->lru_lock); } else @@ -2145,6 +2143,62 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = -1; + unsigned long nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct reclaim_stat dummy_stat; + struct page *page; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + if (nid == -1) { + nid = page_to_nid(page); + INIT_LIST_HEAD(&node_page_list); + } + + if (nid == page_to_nid(page)) { + ClearPageActive(page); + list_move(&page->lru, &node_page_list); + continue; + } + + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + nid = -1; + } + + if (!list_empty(&node_page_list)) { + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + } + + return nr_reclaimed; +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. @@ -2403,17 +2457,70 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); + + if (protection) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2433,7 +2540,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2442,7 +2549,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2586,7 +2693,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, - unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; @@ -2597,40 +2703,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!in_reclaim_compaction(sc)) return false; - /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { - /* - * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the - * full LRU list has been scanned and we are still failing - * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed - */ - if (!nr_reclaimed && !nr_scanned) - return false; - } else { - /* - * For non-__GFP_RETRY_MAYFAIL allocations which can presumably - * fail without consequence, stop if we failed to reclaim - * any pages from the last SWAP_CLUSTER_MAX number of - * pages that were scanned. This will return to the - * caller faster at the risk reclaim/compaction and - * the resulting allocation attempt fails - */ - if (!nr_reclaimed) - return false; - } - /* - * If we have not reclaimed enough pages for compaction and the - * inactive lists are large enough, continue reclaiming + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX + * number of pages that were scanned. This will return to the caller + * with the risk reclaim/compaction and the resulting allocation attempt + * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL + * allocations through requiring that the full LRU list has been scanned + * first, by assuming that zero delta of sc->nr_scanned means full LRU + * scan, but that approximation was wrong, and there were corner cases + * where always a non-zero amount of pages were scanned. */ - pages_for_compaction = compact_gap(sc->order); - inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) - inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - if (sc->nr_reclaimed < pages_for_compaction && - inactive_lru_pages > pages_for_compaction) - return true; + if (!nr_reclaimed) + return false; /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { @@ -2647,7 +2731,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, ; } } - return true; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = compact_gap(sc->order); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); + + return inactive_lru_pages > pages_for_compaction; } static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) @@ -2664,10 +2758,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) do { struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .pgdat = pgdat, - .priority = sc->priority, - }; unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; @@ -2676,7 +2766,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); + memcg = mem_cgroup_iter(root, NULL, NULL); do { unsigned long lru_pages; unsigned long reclaimed; @@ -2703,6 +2793,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } @@ -2719,21 +2816,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); - /* - * Kswapd have to scan all memory cgroups to fulfill - * the overall scan target for the node. - * - * Limit reclaim, on the other hand, only cares about - * nr_to_reclaim pages to be reclaimed and it will - * retry with decreasing priority if one round over the - * whole hierarchy is not sufficient. - */ - if (!current_is_kswapd() && - sc->nr_reclaimed >= sc->nr_to_reclaim) { - mem_cgroup_iter_break(root, memcg); - break; - } - } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -2810,7 +2893,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) wait_iff_congested(BLK_RW_ASYNC, HZ/10); } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)); + sc)); /* * Kswapd gives up on balancing particular nodes after too |