From 599d0c954f91d0689c9bb421b5bc04ea02437a41 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 Jul 2016 15:45:31 -0700 Subject: mm, vmscan: move LRU lists to node This moves the LRU lists from the zone to the node and related data such as counters, tracing, congestion tracking and writeback tracking. Unfortunately, due to reclaim and compaction retry logic, it is necessary to account for the number of LRU pages on both zone and node logic. Most reclaim logic is based on the node counters but the retry logic uses the zone counters which do not distinguish inactive and active sizes. It would be possible to leave the LRU counters on a per-zone basis but it's a heavier calculation across multiple cache lines that is much more frequent than the retry checks. Other than the LRU counters, this is mostly a mechanical patch but note that it introduces a number of anomalies. For example, the scans are per-zone but using per-node counters. We also mark a node as congested when a zone is congested. This causes weird problems that are fixed later but is easier to review. In the event that there is excessive overhead on 32-bit systems due to the nodes being on LRU then there are two potential solutions: 1. Long-term isolation of highmem pages when reclaim is lowmem When pages are skipped, they are immediately added back onto the LRU list. If lowmem reclaim persisted for long periods of time, the same highmem pages get continually scanned. The idea would be that lowmem keeps those pages on a separate list until a reclaim for highmem pages arrives that splices the highmem pages back onto the LRU. It potentially could be implemented similar to the UNEVICTABLE list. That would reduce the skip rate with the potential corner case being that highmem pages have to be scanned and reclaimed to free lowmem slab pages. 2. Linear scan lowmem pages if the initial LRU shrink fails This will break LRU ordering but may be preferable and faster during memory pressure than skipping LRU pages.
Link: http://lkml.kernel.org/r/1467970510-21195-4-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5bd29ba4f174..9aadcc781857 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -23,25 +23,32 @@ static inline int page_is_file_cache(struct page *page) } static __always_inline void __update_lru_size(struct lruvec *lruvec, - enum lru_list lru, int nr_pages) + enum lru_list lru, enum zone_type zid, + int nr_pages) { - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + !!is_file_lru(lru), + nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, - enum lru_list lru, int nr_pages) + enum lru_list lru, enum zone_type zid, + int nr_pages) { #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #else - __update_lru_size(lruvec, lru, nr_pages); + __update_lru_size(lruvec, lru, zid, nr_pages); #endif } static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - update_lru_size(lruvec, lru, hpage_nr_pages(page)); + update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } @@ -49,7 +56,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { 
list_del(&page->lru); - update_lru_size(lruvec, lru, -hpage_nr_pages(page)); + update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page)); } /** -- cgit v1.2.3 From bca6759258dbef378bcf5b872177bcd2259ceb68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 Jul 2016 15:47:05 -0700 Subject: mm, vmstat: remove zone and node double accounting by approximating retries The number of LRU pages, dirty pages and writeback pages must be accounted for on both zones and nodes because of the reclaim retry logic, compaction retry logic and highmem calculations all depending on per-zone stats. Many lowmem allocations are immune from OOM kill due to a check in __alloc_pages_may_oom for (ac->high_zoneidx < ZONE_NORMAL) since commit 03668b3ceb0c ("oom: avoid oom killer for lowmem allocations"). The exception is costly high-order allocations or allocations that cannot fail. If the __alloc_pages_may_oom avoids OOM-kill for low-order lowmem allocations then it would fall through to __alloc_pages_direct_compact. This patch will blindly retry reclaim for zone-constrained allocations in should_reclaim_retry up to MAX_RECLAIM_RETRIES. This is not ideal but without per-zone stats there are not many alternatives. The impact is that zone-constrained allocations may delay before considering the OOM killer. As there is no guarantee enough memory can ever be freed to satisfy compaction, this patch avoids retrying compaction for zone-constrained allocations. In combination, that means that the per-node stats can be used when deciding whether to continue reclaim using a rough approximation. While it is possible this will make the wrong decision on occasion, it will not infinite loop as the number of reclaim attempts is capped by MAX_RECLAIM_RETRIES. The final step is calculating the number of dirtyable highmem pages. As those calculations only care about the global count of file pages in highmem, this patch uses a global counter instead of per-zone stats as it is sufficient.
In combination, this allows the per-zone LRU and dirty state counters to be removed. [mgorman@techsingularity.net: fix acct_highmem_file_pages()] Link: http://lkml.kernel.org/r/1468853426-12858-4-git-send-email-mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1467970510-21195-35-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Suggested-by: Michal Hocko Acked-by: Hillf Danton Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 20 ++++++++++++++--- include/linux/mmzone.h | 4 ---- include/linux/swap.h | 1 - mm/compaction.c | 20 ++++++++++++++++- mm/migrate.c | 2 -- mm/page-writeback.c | 13 +++++------ mm/page_alloc.c | 55 ++++++++++++++++++++++++++++++++++++----------- mm/vmscan.c | 16 -------------- mm/vmstat.c | 3 --- 9 files changed, 84 insertions(+), 50 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 9aadcc781857..dd22b08c47be 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,22 @@ #include #include +#ifdef CONFIG_HIGHMEM +extern atomic_t highmem_file_pages; + +static inline void acct_highmem_file_pages(int zid, enum lru_list lru, + int nr_pages) +{ + if (is_highmem_idx(zid) && is_file_lru(lru)) + atomic_add(nr_pages, &highmem_file_pages); +} +#else +static inline void acct_highmem_file_pages(int zid, enum lru_list lru, + int nr_pages) +{ +} +#endif + /** * page_is_file_cache - should the page be on a file LRU or anon LRU?
* @page: the page to test @@ -29,9 +45,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, struct pglist_data *pgdat = lruvec_pgdat(lruvec); __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); - __mod_zone_page_state(&pgdat->node_zones[zid], - NR_ZONE_LRU_BASE + !!is_file_lru(lru), - nr_pages); + acct_highmem_file_pages(zid, lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bd33e6f1bed0..a3b7f45aac56 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -110,10 +110,6 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, - NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ - NR_ZONE_LRU_ANON = NR_ZONE_LRU_BASE, - NR_ZONE_LRU_FILE, - NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, diff --git a/include/linux/swap.h b/include/linux/swap.h index b17cc4830fa6..cc753c639e3d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ -extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); diff --git a/mm/compaction.c b/mm/compaction.c index e5995f38d677..cd93ea24c565 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1438,6 +1438,11 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, { struct zone *zone; struct zoneref *z; + pg_data_t *last_pgdat = NULL; + + /* Do not retry compaction for zone-constrained allocations */ + if (ac->high_zoneidx < ZONE_NORMAL) + return false; 
/* * Make sure at least one zone would pass __compaction_suitable if we continue @@ -1448,14 +1453,27 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, unsigned long available; enum compact_result compact_result; + if (last_pgdat == zone->zone_pgdat) + continue; + + /* + * This over-estimates the number of pages available for + * reclaim/compaction but walking the LRU would take too + * long. The consequences are that compaction may retry + * longer than it should for a zone-constrained allocation + * request. + */ + last_pgdat = zone->zone_pgdat; + available = pgdat_reclaimable_pages(zone->zone_pgdat) / order; + /* * Do not consider all the reclaimable memory because we do not * want to trash just for a single high order allocation which * is even not guaranteed to appear even if __compaction_suitable * is happy about the watermark check. */ - available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + available = min(zone->managed_pages, available); compact_result = __compaction_suitable(zone, order, alloc_flags, ac_classzone_idx(ac), available); if (compact_result != COMPACT_SKIPPED && diff --git a/mm/migrate.c b/mm/migrate.c index ed0268268e93..ed2f85e61de1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -513,9 +513,7 @@ int migrate_page_move_mapping(struct address_space *mapping, } if (dirty && mapping_cap_account_dirty(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); - __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); - __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); } } local_irq_enable(); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c02aa603f5a..0bca2376bd42 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -299,6 +299,9 @@ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) return nr_pages; } +#ifdef CONFIG_HIGHMEM +atomic_t highmem_file_pages; +#endif static 
unsigned long highmem_dirtyable_memory(unsigned long total) { @@ -306,18 +309,17 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) int node; unsigned long x = 0; int i; + unsigned long dirtyable = atomic_read(&highmem_file_pages); for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { struct zone *z; - unsigned long dirtyable; if (!is_highmem_idx(i)) continue; z = &NODE_DATA(node)->node_zones[i]; - dirtyable = zone_page_state(z, NR_FREE_PAGES) + - zone_page_state(z, NR_ZONE_LRU_FILE); + dirtyable += zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ dirtyable -= min(dirtyable, high_wmark_pages(z)); @@ -2460,7 +2462,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); __inc_node_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_DIRTIED); @@ -2482,7 +2483,6 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, if (mapping_cap_account_dirty(mapping)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); } @@ -2739,7 +2739,6 @@ int clear_page_dirty_for_io(struct page *page) if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } @@ -2786,7 +2785,6 @@ int test_clear_page_writeback(struct page *page) if (ret) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_node_page_state(page, NR_WRITEBACK); - dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } unlock_page_memcg(page); @@ 
-2841,7 +2839,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write) if (!ret) { mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_node_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03e67f2dfdaa..f1b5a0bc11f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3402,6 +3402,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, { struct zone *zone; struct zoneref *z; + pg_data_t *current_pgdat = NULL; /* * Make sure we converge to OOM if we cannot make any progress @@ -3411,27 +3412,56 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; /* - * Keep reclaiming pages while there is a chance this will lead somewhere. - * If none of the target zones can satisfy our allocation request even - * if all reclaimable pages are considered then we are screwed and have - * to go OOM. + * Blindly retry lowmem allocation requests that are often ignored by + * the OOM killer up to MAX_RECLAIM_RETRIES as we not have a reliable + * and fast means of calculating reclaimable, dirty and writeback pages + * in eligible zones. + */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + + /* + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. 
*/ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; + int zid; - available = reclaimable = zone_reclaimable_pages(zone); + if (current_pgdat == zone->zone_pgdat) + continue; + + current_pgdat = zone->zone_pgdat; + available = reclaimable = pgdat_reclaimable_pages(current_pgdat); available -= DIV_ROUND_UP(no_progress_loops * available, MAX_RECLAIM_RETRIES); - available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* Account for all free pages on eligible zones */ + for (zid = 0; zid <= zone_idx(zone); zid++) { + struct zone *acct_zone = &current_pgdat->node_zones[zid]; + + available += zone_page_state_snapshot(acct_zone, NR_FREE_PAGES); + } /* * Would the allocation succeed if we reclaimed the whole - * available? + * available? This is approximate because there is no + * accurate count of reclaimable pages per zone. */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + for (zid = 0; zid <= zone_idx(zone); zid++) { + struct zone *check_zone = &current_pgdat->node_zones[zid]; + unsigned long estimate; + + estimate = min(check_zone->managed_pages, available); + if (!__zone_watermark_ok(check_zone, order, + min_wmark_pages(check_zone), ac_classzone_idx(ac), + alloc_flags, estimate)) + continue; + /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for @@ -3441,15 +3471,16 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, if (!did_some_progress) { unsigned long write_pending; - write_pending = zone_page_state_snapshot(zone, - NR_ZONE_WRITE_PENDING); + write_pending = + node_page_state(current_pgdat, NR_WRITEBACK) + + node_page_state(current_pgdat, NR_FILE_DIRTY); if (2 * write_pending > reclaimable) { congestion_wait(BLK_RW_ASYNC, HZ/10); return true; } } - +out: /* * Memory allocation/reclaim might be called from a WQ * context and the current implementation of the
WQ diff --git a/mm/vmscan.c b/mm/vmscan.c index d5ee6d998b5e..5625eccc0140 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -194,22 +194,6 @@ static bool sane_reclaim(struct scan_control *sc) } #endif -/* - * This misses isolated pages which are not accounted for to save counters. - * As the data only determines if reclaim or compaction continues, it is - * not expected that isolated pages will be a dominating factor. - */ -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - unsigned long nr; - - nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE); - if (get_nr_swap_pages() > 0) - nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON); - - return nr; -} - unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) { unsigned long nr; diff --git a/mm/vmstat.c b/mm/vmstat.c index ac509572a50b..91ecca96dcae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -921,9 +921,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) const char * const vmstat_text[] = { /* enum zone_stat_item countes */ "nr_free_pages", - "nr_zone_anon_lru", - "nr_zone_file_lru", - "nr_zone_write_pending", "nr_mlock", "nr_slab_reclaimable", "nr_slab_unreclaimable", -- cgit v1.2.3 From 7ee36a14f06cc937f6b2c2932c2e48f590970581 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 Jul 2016 15:47:17 -0700 Subject: mm, vmscan: Update all zone LRU sizes before updating memcg Minchan Kim reported setting the following warning on a 32-bit system although it can affect 64-bit systems. WARNING: CPU: 4 PID: 1322 at mm/memcontrol.c:998 mem_cgroup_update_lru_size+0x103/0x110 mem_cgroup_update_lru_size(f44b4000, 1, -7): zid 1 lru_size 1 but empty Modules linked in: CPU: 4 PID: 1322 Comm: cp Not tainted 4.7.0-rc4-mm1+ #143 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x76/0xaf __warn+0xea/0x110 ? 
mem_cgroup_update_lru_size+0x103/0x110 warn_slowpath_fmt+0x3b/0x40 mem_cgroup_update_lru_size+0x103/0x110 isolate_lru_pages.isra.61+0x2e2/0x360 shrink_active_list+0xac/0x2a0 ? __delay+0xe/0x10 shrink_node_memcg+0x53c/0x7a0 shrink_node+0xab/0x2a0 do_try_to_free_pages+0xc6/0x390 try_to_free_pages+0x245/0x590 LRU list contents and counts are updated separately. Counts are updated before pages are added to the LRU and updated after pages are removed. The warning above is from a check in mem_cgroup_update_lru_size that ensures that list sizes of zero are empty. The problem is that node-lru needs to account for highmem pages if CONFIG_HIGHMEM is set. One impact of the implementation is that the sizes are updated in multiple passes when pages from multiple zones were isolated. This happens whether HIGHMEM is set or not. When multiple zones are isolated, it's possible for a debugging check in memcg to be tripped. This patch forces all the zone counts to be updated before the memcg function is called. 
Link: http://lkml.kernel.org/r/1468588165-12461-6-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Reported-by: Minchan Kim Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- include/linux/mm_inline.h | 5 ++--- mm/memcontrol.c | 5 +---- mm/vmscan.c | 40 +++++++++++++++++++++++++++++++++------- 4 files changed, 37 insertions(+), 15 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b759827b2f1e..5147e650287a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -430,7 +430,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - enum zone_type zid, int nr_pages); + int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index dd22b08c47be..bcc4ed07fa90 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -52,10 +52,9 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, int nr_pages) { -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); -#else __update_lru_size(lruvec, lru, zid, nr_pages); +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9cbd40ebccd1..13be30c3ea78 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -965,7 +965,6 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on - * @zid: Zone ID of the zone pages have been added to * @nr_pages: positive when adding or negative 
when removing * * This function must be called under lru_lock, just before a page is added @@ -973,15 +972,13 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - enum zone_type zid, int nr_pages) + int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; bool empty; - __update_lru_size(lruvec, lru, zid, nr_pages); - if (mem_cgroup_disabled()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5625eccc0140..b3f5b359280d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1350,6 +1350,38 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) return ret; } + +/* + * Update LRU sizes after isolating pages. The LRU size updates must + * be complete before mem_cgroup_update_lru_size due to a santity check. + */ +static __always_inline void update_lru_sizes(struct lruvec *lruvec, + enum lru_list lru, unsigned long *nr_zone_taken, + unsigned long nr_taken) +{ +#ifdef CONFIG_HIGHMEM + int zid; + + /* + * Highmem has separate accounting for highmem pages so each zone + * is updated separately. + */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_zone_taken[zid]) + continue; + + __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + } +#else + /* Zone ID does not matter on !HIGHMEM */ + __update_lru_size(lruvec, lru, 0, -nr_taken); +#endif + +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); +#endif +} + /* * zone_lru_lock is heavily contended. 
Some of the functions that * shrink the lists perform better by taking out a batch of pages @@ -1436,13 +1468,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); - for (scan = 0; scan < MAX_NR_ZONES; scan++) { - nr_pages = nr_zone_taken[scan]; - if (!nr_pages) - continue; - - update_lru_size(lruvec, lru, scan, -nr_pages); - } + update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); return nr_taken; } -- cgit v1.2.3 From 71c799f4982d340fff86e751898841322f07f235 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 28 Jul 2016 15:47:26 -0700 Subject: mm: add per-zone lru list stat When I did stress test with hackbench, I got OOM message frequently which didn't ever happen in zone-lru. gfp_mask=0x26004c0(GFP_KERNEL|__GFP_REPEAT|__GFP_NOTRACK), order=0 .. .. __alloc_pages_nodemask+0xe52/0xe60 ? new_slab+0x39c/0x3b0 new_slab+0x39c/0x3b0 ___slab_alloc.constprop.87+0x6da/0x840 ? __alloc_skb+0x3c/0x260 ? _raw_spin_unlock_irq+0x27/0x60 ? trace_hardirqs_on_caller+0xec/0x1b0 ? finish_task_switch+0xa6/0x220 ? poll_select_copy_remaining+0x140/0x140 __slab_alloc.isra.81.constprop.86+0x40/0x6d ? __alloc_skb+0x3c/0x260 kmem_cache_alloc+0x22c/0x260 ? __alloc_skb+0x3c/0x260 __alloc_skb+0x3c/0x260 alloc_skb_with_frags+0x4e/0x1a0 sock_alloc_send_pskb+0x16a/0x1b0 ? wait_for_unix_gc+0x31/0x90 ? alloc_set_pte+0x2ad/0x310 unix_stream_sendmsg+0x28d/0x340 sock_sendmsg+0x2d/0x40 sock_write_iter+0x6c/0xc0 __vfs_write+0xc0/0x120 vfs_write+0x9b/0x1a0 ? 
__might_fault+0x49/0xa0 SyS_write+0x44/0x90 do_fast_syscall_32+0xa6/0x1e0 sysenter_past_esp+0x45/0x74 Mem-Info: active_anon:104698 inactive_anon:105791 isolated_anon:192 active_file:433 inactive_file:283 isolated_file:22 unevictable:0 dirty:0 writeback:296 unstable:0 slab_reclaimable:6389 slab_unreclaimable:78927 mapped:474 shmem:0 pagetables:101426 bounce:0 free:10518 free_pcp:334 free_cma:0 Node 0 active_anon:418792kB inactive_anon:423164kB active_file:1732kB inactive_file:1132kB unevictable:0kB isolated(anon):768kB isolated(file):88kB mapped:1896kB dirty:0kB writeback:1184kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:1478632 all_unreclaimable? yes DMA free:3304kB min:68kB low:84kB high:100kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:4088kB kernel_stack:0kB pagetables:2480kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 809 1965 1965 Normal free:3436kB min:3604kB low:4504kB high:5404kB present:897016kB managed:858460kB mlocked:0kB slab_reclaimable:25556kB slab_unreclaimable:311712kB kernel_stack:164608kB pagetables:30844kB bounce:0kB free_pcp:620kB local_pcp:104kB free_cma:0kB lowmem_reserve[]: 0 0 9247 9247 HighMem free:33808kB min:512kB low:1796kB high:3080kB present:1183736kB managed:1183736kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:372252kB bounce:0kB free_pcp:428kB local_pcp:72kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 2*4kB (UM) 2*8kB (UM) 0*16kB 1*32kB (U) 1*64kB (U) 2*128kB (UM) 1*256kB (U) 1*512kB (M) 0*1024kB 1*2048kB (U) 0*4096kB = 3192kB Normal: 33*4kB (MH) 79*8kB (ME) 11*16kB (M) 4*32kB (M) 2*64kB (ME) 2*128kB (EH) 7*256kB (EH) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 3244kB HighMem: 2590*4kB (UM) 1568*8kB (UM) 491*16kB (UM) 60*32kB (UM) 6*64kB (M) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 33064kB Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB 25121 total pagecache pages 24160 pages in 
swap cache Swap cache stats: add 86371, delete 62211, find 42865/60187 Free swap = 4015560kB Total swap = 4192252kB 524186 pages RAM 295934 pages HighMem/MovableOnly 9658 pages reserved 0 pages cma reserved The order-0 allocation for normal zone failed while there is a lot of reclaimable memory (i.e., anonymous memory with free swap). I wanted to analyze the problem but it was hard because we removed per-zone lru stat so I couldn't know how much anonymous memory there is in the normal/dma zones. When we investigate an OOM problem, the reclaimable memory count is a crucial stat for finding the problem. Without it, it's hard to parse the OOM message so I believe we should keep it. With per-zone lru stat, gfp_mask=0x26004c0(GFP_KERNEL|__GFP_REPEAT|__GFP_NOTRACK), order=0 Mem-Info: active_anon:101103 inactive_anon:102219 isolated_anon:0 active_file:503 inactive_file:544 isolated_file:0 unevictable:0 dirty:0 writeback:34 unstable:0 slab_reclaimable:6298 slab_unreclaimable:74669 mapped:863 shmem:0 pagetables:100998 bounce:0 free:23573 free_pcp:1861 free_cma:0 Node 0 active_anon:404412kB inactive_anon:409040kB active_file:2012kB inactive_file:2176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3452kB dirty:0kB writeback:136kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:1320845 all_unreclaimable?
yes DMA free:3296kB min:68kB low:84kB high:100kB active_anon:5540kB inactive_anon:0kB active_file:0kB inactive_file:0kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:248kB slab_unreclaimable:2628kB kernel_stack:792kB pagetables:2316kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 809 1965 1965 Normal free:3600kB min:3604kB low:4504kB high:5404kB active_anon:86304kB inactive_anon:0kB active_file:160kB inactive_file:376kB present:897016kB managed:858524kB mlocked:0kB slab_reclaimable:24944kB slab_unreclaimable:296048kB kernel_stack:163832kB pagetables:35892kB bounce:0kB free_pcp:3076kB local_pcp:656kB free_cma:0kB lowmem_reserve[]: 0 0 9247 9247 HighMem free:86156kB min:512kB low:1796kB high:3080kB active_anon:312852kB inactive_anon:410024kB active_file:1924kB inactive_file:2012kB present:1183736kB managed:1183736kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:365784kB bounce:0kB free_pcp:3868kB local_pcp:720kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 8*4kB (UM) 8*8kB (UM) 4*16kB (M) 2*32kB (UM) 2*64kB (UM) 1*128kB (M) 3*256kB (UME) 2*512kB (UE) 1*1024kB (E) 0*2048kB 0*4096kB = 3296kB Normal: 240*4kB (UME) 160*8kB (UME) 23*16kB (ME) 3*32kB (UE) 3*64kB (UME) 2*128kB (ME) 1*256kB (U) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 3408kB HighMem: 10942*4kB (UM) 3102*8kB (UM) 866*16kB (UM) 76*32kB (UM) 11*64kB (UM) 4*128kB (UM) 1*256kB (M) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 86344kB Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB 54409 total pagecache pages 53215 pages in swap cache Swap cache stats: add 300982, delete 247765, find 157978/226539 Free swap = 3803244kB Total swap = 4192252kB 524186 pages RAM 295934 pages HighMem/MovableOnly 9642 pages reserved 0 pages cma reserved With that, we can see normal zone has a 86M reclaimable memory so we can know something goes wrong(I will fix the problem in next patch) in reclaim. 
[mgorman@techsingularity.net: rename zone LRU stats in /proc/vmstat] Link: http://lkml.kernel.org/r/20160725072300.GK10438@techsingularity.net Link: http://lkml.kernel.org/r/1469110261-7365-2-git-send-email-mgorman@techsingularity.net Signed-off-by: Minchan Kim Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 ++ include/linux/mmzone.h | 6 ++++++ mm/page_alloc.c | 10 ++++++++++ mm/vmscan.c | 9 --------- mm/vmstat.c | 5 +++++ 5 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index bcc4ed07fa90..9cc130f5feb2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -45,6 +45,8 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, struct pglist_data *pgdat = lruvec_pgdat(lruvec); __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); acct_highmem_file_pages(zid, lru, nr_pages); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a3b7f45aac56..1a813ad335f4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -110,6 +110,12 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ + NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, + NR_ZONE_ACTIVE_ANON, + NR_ZONE_INACTIVE_FILE, + NR_ZONE_ACTIVE_FILE, + NR_ZONE_UNEVICTABLE, NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ffd4fdbae8b5..759cfa8cbbeb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4356,6 +4356,11 @@ void show_free_areas(unsigned int filter) " min:%lukB" " low:%lukB" " high:%lukB" + " 
active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" @@ -4373,6 +4378,11 @@ void show_free_areas(unsigned int filter) K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), diff --git a/mm/vmscan.c b/mm/vmscan.c index 22aec2bcfeec..222d5403dd4b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1359,23 +1359,14 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken, unsigned long nr_taken) { -#ifdef CONFIG_HIGHMEM int zid; - /* - * Highmem has separate accounting for highmem pages so each zone - * is updated separately. 
- */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } -#else - /* Zone ID does not matter on !HIGHMEM */ - __update_lru_size(lruvec, lru, 0, -nr_taken); -#endif #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); diff --git a/mm/vmstat.c b/mm/vmstat.c index 91ecca96dcae..053075ac67b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -921,6 +921,11 @@ int fragmentation_index(struct zone *zone, unsigned int order) const char * const vmstat_text[] = { /* enum zone_stat_item countes */ "nr_free_pages", + "nr_zone_inactive_anon", + "nr_zone_active_anon", + "nr_zone_inactive_file", + "nr_zone_active_file", + "nr_zone_unevictable", "nr_mlock", "nr_slab_reclaimable", "nr_slab_unreclaimable", -- cgit v1.2.3 From bb4cc2bea6df7854d629bff114ca03237cc718d6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 28 Jul 2016 15:47:29 -0700 Subject: mm, vmscan: remove highmem_file_pages With the reintroduction of per-zone LRU stats, highmem_file_pages is redundant so remove it. 
[mgorman@techsingularity.net: wrong stat is being accumulated in highmem_dirtyable_memory] Link: http://lkml.kernel.org/r/20160725092324.GM10438@techsingularity.net Link: http://lkml.kernel.org/r/1469110261-7365-3-git-send-email-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 17 ----------------- mm/page-writeback.c | 12 ++++-------- 2 files changed, 4 insertions(+), 25 deletions(-) (limited to 'include/linux/mm_inline.h') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 9cc130f5feb2..71613e8a720f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,22 +4,6 @@ #include #include -#ifdef CONFIG_HIGHMEM -extern atomic_t highmem_file_pages; - -static inline void acct_highmem_file_pages(int zid, enum lru_list lru, - int nr_pages) -{ - if (is_highmem_idx(zid) && is_file_lru(lru)) - atomic_add(nr_pages, &highmem_file_pages); -} -#else -static inline void acct_highmem_file_pages(int zid, enum lru_list lru, - int nr_pages) -{ -} -#endif - /** * page_is_file_cache - should the page be on a file LRU or anon LRU? 
* @page: the page to test @@ -47,7 +31,6 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); - acct_highmem_file_pages(zid, lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 573d138fa7a5..7b5920a3500f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -299,17 +299,13 @@ static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) return nr_pages; } -#ifdef CONFIG_HIGHMEM -atomic_t highmem_file_pages; -#endif static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM int node; - unsigned long x; + unsigned long x = 0; int i; - unsigned long dirtyable = 0; for_each_node_state(node, N_HIGH_MEMORY) { for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { @@ -326,12 +322,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) nr_pages = zone_page_state(z, NR_FREE_PAGES); /* watch for underflows */ nr_pages -= min(nr_pages, high_wmark_pages(z)); - dirtyable += nr_pages; + nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); + nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); + x += nr_pages; } } - x = dirtyable + atomic_read(&highmem_file_pages); - /* * Unreclaimable memory (kernel memory or anonymous memory * without swap) can bring down the dirtyable pages below -- cgit v1.2.3