Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 192
1 file changed, 174 insertions, 18 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d36..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages;	/* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	cond_resched();
 
+	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
 		if (PageWriteback(page)) {
-			nr_writeback++;
-			unlock_page(page);
-			goto keep;
+			/*
+			 * memcg doesn't have any dirty pages throttling so we
+			 * could easily OOM just because too many pages are in
+			 * writeback and there is nothing else to reclaim.
+			 *
+			 * Check __GFP_IO, certainly because a loop driver
+			 * thread might enter reclaim, and deadlock if it waits
+			 * on a page for which it is needed to do the write
+			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
+			 * but more thought would probably show more reasons.
+			 *
+			 * Don't require __GFP_FS, since we're not going into
+			 * the FS, just waiting on its writeback completion.
+			 * Worryingly, ext4 gfs2 and xfs allocate pages with
+			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+			 * testing may_enter_fs here is liable to OOM on them.
+			 */
+			if (global_reclaim(sc) ||
+			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+				/*
+				 * This is slightly racy - end_page_writeback()
+				 * might have just cleared PageReclaim, then
+				 * setting PageReclaim here end up interpreted
+				 * as PageReadahead - but that does not matter
+				 * enough to care.  What we do want is for this
+				 * page to have PageReclaim set next time memcg
+				 * reclaim reaches the tests above, so it will
+				 * then wait_on_page_writeback() to avoid OOM;
+				 * and it's also appropriate in global reclaim.
+				 */
+				SetPageReclaim(page);
+				nr_writeback++;
+				goto keep_locked;
+			}
+			wait_on_page_writeback(page);
 		}
 
 		references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
+	mem_cgroup_uncharge_end();
 	*ret_nr_dirty += nr_dirty;
 	*ret_nr_writeback += nr_writeback;
 	return nr_reclaimed;
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
  * by looking at the fraction of the pages scanned we did rotate back
  * onto the active list instead of evict.
  *
- * nr[0] = anon pages to scan; nr[1] = file pages to scan
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+						(enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could forcing other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		return;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		return;
+
+	/* Account for the throttling */
+	count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);
+		return;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 						gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
 	int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
-		return true;
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, a large process exits therby
+	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
+	 * so wake them now if necessary. If necessary, processes will wake
+	 * kswapd and get throttled again
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	 * must be balanced
 	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return pgdat_balanced(pgdat, balanced, classzone_idx);
 	else
-		return !all_zones_ok;
+		return all_zones_ok;
 }
 
 /*
@@ -2537,7 +2677,7 @@ loop_again:
 				 * consider it to be no longer congested. It's
 				 * possible there are dirty pages backed by
 				 * congested BDIs but as pressure is relieved,
-				 * spectulatively avoid congestion waits
+				 * speculatively avoid congestion waits
 				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
 				if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
 			}
 
 		}
+
+		/*
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should not be
+		 * able to safely make forward progress. Wake them
+		 */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
@@ -2646,7 +2796,7 @@ out:
 	}
 
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 * them before going back to sleep.
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-		schedule();
+
+		if (!kthread_should_stop())
+			schedule();
+
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
 		if (remaining)
@@ -2955,14 +3108,17 @@ int kswapd_run(int nid)
 }
 
 /*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined.  Caller must
+ * hold lock_memory_hotplug().
 */
 void kswapd_stop(int nid)
 {
 	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
 
-	if (kswapd)
+	if (kswapd) {
 		kthread_stop(kswapd);
+		NODE_DATA(nid)->kswapd = NULL;
+	}
 }
 
 static int __init kswapd_init(void)
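For reference, the new pfmemalloc_watermark_ok() in the hunk above boils down to one comparison: sum min_wmark_pages() over the zones up to ZONE_NORMAL to form the PFMEMALLOC reserve, sum NR_FREE_PAGES over the same zones, and report the node healthy only while free pages exceed half of that reserve. The small userspace sketch below just replays that arithmetic on made-up numbers; struct zone_sample, pfmemalloc_watermark_ok_sample() and the per-zone values are inventions for illustration, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical per-zone figures, in pages; stand-ins for
 * min_wmark_pages(zone) and zone_page_state(zone, NR_FREE_PAGES).
 */
struct zone_sample {
	const char *name;
	unsigned long min_wmark;
	unsigned long nr_free;
};

/*
 * Mirrors the shape of pfmemalloc_watermark_ok() in the patch: free
 * pages on the node must exceed half of the summed min watermarks.
 */
static bool pfmemalloc_watermark_ok_sample(const struct zone_sample *zones,
					   int nr_zones)
{
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		pfmemalloc_reserve += zones[i].min_wmark;
		free_pages += zones[i].nr_free;
	}

	return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
	/* ZONE_DMA, ZONE_DMA32, ZONE_NORMAL with invented numbers. */
	const struct zone_sample node0[] = {
		{ "DMA",    32,    10 },
		{ "DMA32",  1024,  300 },
		{ "Normal", 8192, 2000 },
	};

	/* reserve = 9248, half = 4624, free = 2310 -> not ok */
	printf("wmark_ok = %d\n",
	       pfmemalloc_watermark_ok_sample(node0, 3));
	return 0;
}

With these sample numbers the free total (2310 pages) sits below half the reserve (4624 pages), so a direct reclaimer hitting such a node would be throttled until kswapd restores the watermark.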
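The other piece worth restating is the branch order in throttle_direct_reclaim(): kernel threads are never throttled, callers pass straight through while the watermark check succeeds, callers that cannot enter the filesystem only wait on pfmemalloc_wait with a one-second timeout, and everyone else sleeps killable until kswapd wakes the queue. The toy classifier below restates that decision table outside the kernel; the flag macros, the enum and classify() are invented for illustration and are not part of the patch, which sleeps in place rather than returning a verdict.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the relevant kernel flags. */
#define PF_KTHREAD_SAMPLE	0x1	/* task flag: kernel thread */
#define GFP_FS_SAMPLE		0x2	/* gfp flag: may call into the FS */

enum throttle_action {
	THROTTLE_NONE,		/* proceed with direct reclaim immediately */
	THROTTLE_TIMEOUT,	/* wait on pfmemalloc_wait for at most HZ */
	THROTTLE_KILLABLE,	/* wait killable until kswapd wakes us */
};

/* Follows the order of the checks in throttle_direct_reclaim() above. */
static enum throttle_action classify(unsigned int task_flags,
				     unsigned int gfp_mask, bool wmark_ok)
{
	if (task_flags & PF_KTHREAD_SAMPLE)
		return THROTTLE_NONE;		/* may be cleaning pages itself */
	if (wmark_ok)
		return THROTTLE_NONE;		/* reserves are still healthy */
	if (!(gfp_mask & GFP_FS_SAMPLE))
		return THROTTLE_TIMEOUT;	/* may hold a lock kswapd needs */
	return THROTTLE_KILLABLE;
}

int main(void)
{
	printf("%d\n", classify(0, GFP_FS_SAMPLE, false));	/* 2: killable */
	printf("%d\n", classify(0, 0, false));			/* 1: timeout */
	printf("%d\n", classify(PF_KTHREAD_SAMPLE, 0, false));	/* 0: not throttled */
	return 0;
}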
