diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 326 |
1 files changed, 182 insertions, 144 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -48,6 +48,7 @@ #include <asm/div64.h> #include <linux/swapops.h> +#include <linux/balloon_compaction.h> #include "internal.h" @@ -146,6 +147,25 @@ static bool global_reclaim(struct scan_control *sc) } #endif +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) @@ -155,14 +175,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) } /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm. */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker) { - atomic_long_set(&shrinker->nr_in_batch, 0); + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. + */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); + return 0; } EXPORT_SYMBOL(register_shrinker); @@ -174,18 +211,106 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); -static inline int do_shrinker_shrink(struct shrinker *shrinker, - struct shrink_control *sc, - unsigned long nr_to_scan) -{ - sc->nr_to_scan = nr_to_scan; - return (*shrinker->shrink)(shrinker, sc); +#define SHRINK_BATCH 128 + +static unsigned long +shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + unsigned long nr_pages_scanned, unsigned long lru_pages) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long max_pass; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + + max_pass = shrinker->count_objects(shrinker, shrinkctl); + if (max_pass == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta *= max_pass; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = max_pass; + } + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); + + while (total_scan >= batch_size) { + unsigned long ret; + + shrinkctl->nr_to_scan = batch_size; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; + + cond_resched(); + } + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + return freed; } -#define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches * @@ -205,115 +330,45 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, * * Returns the number of slab objects which we shrunk. */ -unsigned long shrink_slab(struct shrink_control *shrink, +unsigned long shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_pages_scanned, unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; if (nr_pages_scanned == 0) nr_pages_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. + */ + freed = 1; goto out; } list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - long total_scan; - long max_pass; - int shrink_ret = 0; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; - - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) - continue; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); - - total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - total_scan += delta; - if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, total_scan); - total_scan = max_pass; - } - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; - - trace_mm_shrink_slab_start(shrinker, shrink, nr, - nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); - - while (total_scan >= batch_size) { - int nr_before; + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (!node_online(shrinkctl->nid)) + continue; - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) + if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && + (shrinkctl->nid != 0)) break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; - cond_resched(); - } + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. - */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, - &shrinker->nr_in_batch); - else - new_nr = atomic_long_read(&shrinker->nr_in_batch); - - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); + } } up_read(&shrinker_rwsem); out: cond_resched(); - return ret; + return freed; } static inline int is_page_cache_freeable(struct page *page) @@ -545,7 +600,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; + bool is_unevictable; int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -560,14 +615,14 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = page_lru_base_type(page); + is_unevictable = false; lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -587,7 +642,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -598,9 +653,9 @@ redo: */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ @@ -1060,7 +1115,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page)) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } @@ -1789,7 +1845,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && zone->all_unreclaimable) + if (current_is_kswapd() && !zone_reclaimable(zone)) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -2244,8 +2300,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ if (IS_ENABLED(CONFIG_COMPACTION)) { /* @@ -2283,11 +2339,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) return aborted_reclaim; } -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; -} - /* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) @@ -2301,7 +2352,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (!zone->all_unreclaimable) + if (zone_reclaimable(zone)) return false; } @@ -2354,12 +2405,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (global_reclaim(sc)) { unsigned long lru_pages = 0; + + nodes_clear(shrink->nodes_to_scan); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(sc->gfp_mask)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), + shrink->nodes_to_scan); } shrink_slab(shrink, sc->nr_scanned, lru_pages); @@ -2712,7 +2767,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! */ - if (zone->all_unreclaimable) { + if (!zone_reclaimable(zone)) { balanced_pages += zone->managed_pages; continue; } @@ -2773,7 +2828,6 @@ static bool kswapd_shrink_zone(struct zone *zone, unsigned long lru_pages, unsigned long *nr_attempted) { - unsigned long nr_slab; int testorder = sc->order; unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2816,17 +2870,16 @@ static bool kswapd_shrink_zone(struct zone *zone, return true; shrink_zone(zone, sc); + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - zone_clear_flag(zone, ZONE_WRITEBACK); /* @@ -2835,7 +2888,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * BDIs but as pressure is relieved, speculatively avoid congestion * waits. */ - if (!zone->all_unreclaimable && + if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { zone_clear_flag(zone, ZONE_CONGESTED); zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); @@ -2901,8 +2954,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* @@ -2980,8 +3033,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; @@ -3237,7 +3290,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); @@ -3265,20 +3318,6 @@ unsigned long global_reclaimable_pages(void) return nr; } -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -3524,10 +3563,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * number of slab pages and shake the slab until it is reduced * by the same nr_pages that we used for reclaiming unmapped * pages. - * - * Note that shrink_slab will free memory on all zones and may - * take a long time. */ + nodes_clear(shrink.nodes_to_scan); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); for (;;) { unsigned long lru_pages = zone_reclaimable_pages(zone); @@ -3576,7 +3614,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* |