diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/cma.h | 2 | ||||
-rw-r--r-- | mm/cma_debug.c | 11 | ||||
-rw-r--r-- | mm/huge_memory.c | 7 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 2 | ||||
-rw-r--r-- | mm/kasan/report.c | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 54 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 13 | ||||
-rw-r--r-- | mm/migrate.c | 8 | ||||
-rw-r--r-- | mm/page-writeback.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 76 | ||||
-rw-r--r-- | mm/page_owner.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 4 | ||||
-rw-r--r-- | mm/slab_common.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 16 |
16 files changed, 132 insertions, 83 deletions
@@ -16,7 +16,7 @@ struct cma { extern struct cma cma_areas[MAX_CMA_AREAS]; extern unsigned cma_area_count; -static unsigned long cma_bitmap_maxno(struct cma *cma) +static inline unsigned long cma_bitmap_maxno(struct cma *cma) { return cma->count >> cma->order_per_bit; } diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 7621ee34daa0..f8e4b60db167 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -39,7 +39,7 @@ static int cma_used_get(void *data, u64 *val) mutex_lock(&cma->lock); /* pages counter is smaller than sizeof(int) */ - used = bitmap_weight(cma->bitmap, (int)cma->count); + used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); mutex_unlock(&cma->lock); *val = (u64)used << cma->order_per_bit; @@ -52,13 +52,14 @@ static int cma_maxchunk_get(void *data, u64 *val) struct cma *cma = data; unsigned long maxchunk = 0; unsigned long start, end = 0; + unsigned long bitmap_maxno = cma_bitmap_maxno(cma); mutex_lock(&cma->lock); for (;;) { - start = find_next_zero_bit(cma->bitmap, cma->count, end); + start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); if (start >= cma->count) break; - end = find_next_bit(cma->bitmap, cma->count, start); + end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); } mutex_unlock(&cma->lock); @@ -170,10 +171,10 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) tmp = debugfs_create_dir(name, cma_debugfs_root); - debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, + debugfs_create_file("alloc", S_IWUSR, tmp, cma, &cma_alloc_fops); - debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma, + debugfs_create_file("free", S_IWUSR, tmp, cma, &cma_free_fops); debugfs_create_file("base_pfn", S_IRUGO, tmp, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c107094f79ba..097c7a4bfbd9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page, /* after clearing PageTail the gup refcount can be released */ smp_mb__after_atomic(); - /* - * retain hwpoison flag of the poisoned tail page: - * fix for the unsuitable process killed on Guest Machine(KVM) - * by the memory-failure. - */ - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6c513a63ea84..7b28e9cdf1c7 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -2,7 +2,7 @@ * This file contains shadow memory manipulation code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some of code borrowed from https://github.com/xairy/linux by * Andrey Konovalov <adech.fo@gmail.com> diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 680ceedf810a..e07c94fbd0ac 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -2,7 +2,7 @@ * This file contains error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some of code borrowed from https://github.com/xairy/linux by * Andrey Konovalov <adech.fo@gmail.com> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c53543d89282..1f4446a90cef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -909,6 +909,18 @@ int get_hwpoison_page(struct page *page) * directly for tail pages. */ if (PageTransHuge(head)) { + /* + * Non anonymous thp exists only in allocation/free time. We + * can't handle such a case correctly, so let's give it up. + * This should be better than triggering BUG_ON when kernel + * tries to touch the "partially handled" page. + */ + if (!PageAnon(head)) { + pr_err("MCE: %#lx: non anonymous thp\n", + page_to_pfn(page)); + return 0; + } + if (get_page_unless_zero(head)) { if (PageTail(page)) get_page(page); @@ -1134,17 +1146,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { - if (!PageAnon(hpage)) { - pr_err("MCE: %#lx: non anonymous thp\n", pfn); - if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); - put_page(p); - if (p != hpage) - put_page(hpage); - return -EBUSY; - } - if (unlikely(split_huge_page(hpage))) { - pr_err("MCE: %#lx: thp split failed\n", pfn); + if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + if (!PageAnon(hpage)) + pr_err("MCE: %#lx: non anonymous thp\n", pfn); + else + pr_err("MCE: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &num_poisoned_pages); put_page(p); @@ -1209,9 +1215,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); put_page(hpage); - res = 0; - goto out; + return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) @@ -1535,6 +1541,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) */ ret = __get_any_page(page, pfn, 0); if (!PageLRU(page)) { + /* Drop page reference which is from __get_any_page() */ + put_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; @@ -1564,13 +1572,12 @@ static int soft_offline_huge_page(struct page *page, int flags) unlock_page(hpage); ret = isolate_huge_page(hpage, &pagelist); - if (ret) { - /* - * get_any_page() and isolate_huge_page() takes a refcount each, - * so need to drop one here. - */ - put_page(hpage); - } else { + /* + * get_any_page() and isolate_huge_page() takes a refcount each, + * so need to drop one here. + */ + put_page(hpage); + if (!ret) { pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); return -EBUSY; } @@ -1656,6 +1663,8 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); + if (!TestSetPageHWPoison(page)) + atomic_long_inc(&num_poisoned_pages); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { @@ -1670,9 +1679,8 @@ static int __soft_offline_page(struct page *page, int flags) pfn, ret, page->flags); if (ret > 0) ret = -EIO; - } else { - SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + if (TestClearPageHWPoison(page)) + atomic_long_dec(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 26fbba7d888f..6da82bcb0a8b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int nid = pgdat->node_id; int zone_type; - unsigned long flags; + unsigned long flags, pfn; int ret; zone_type = zone - pgdat->node_zones; @@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn, MEMMAP_HOTPLUG); + + /* online_page_range is called later and expects pages reserved */ + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + if (!pfn_valid(pfn)) + continue; + + SetPageReserved(pfn_to_page(pfn)); + } return 0; } @@ -1269,6 +1277,7 @@ int __ref add_memory(int nid, u64 start, u64 size) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + memblock_add_node(start, size, nid); goto out; @@ -2005,6 +2014,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); + memblock_free(start, size); + memblock_remove(start, size); arch_remove_memory(start, size); diff --git a/mm/migrate.c b/mm/migrate.c index ee401e4e5ef1..eb4267107d1f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -880,7 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes or remove ptes */ if (page_mapped(page)) { try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| + TTU_IGNORE_HWPOISON); page_was_mapped = 1; } @@ -950,7 +951,10 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (reason != MR_MEMORY_FAILURE) + /* Soft-offlined page shouldn't go through lru cache list */ + if (reason == MR_MEMORY_FAILURE) + put_page(page); + else putback_lru_page(page); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 22cddd3e5de8..5cccc127ef81 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2063,10 +2063,10 @@ static struct notifier_block ratelimit_nb = { */ void __init page_writeback_init(void) { + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - - BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 506eac8b38af..5b5240b7f642 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,7 +18,6 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/interrupt.h> -#include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> @@ -246,9 +245,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - int nid = early_pfn_to_nid(pfn); - - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) return true; return false; @@ -983,21 +980,21 @@ static void __init __free_pages_boot_core(struct page *page, #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -/* Only safe to use early in boot when initialisation is single-threaded */ + static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; - /* The system will behave unpredictably otherwise */ - BUG_ON(system_state != SYSTEM_BOOTING); - + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; + if (nid < 0) + nid = 0; + spin_unlock(&early_pfn_lock); + + return nid; } #endif @@ -1062,7 +1059,15 @@ static void __init deferred_free_range(struct page *page, __free_pages_boot_core(page, pfn, 0); } -static __initdata DECLARE_RWSEM(pgdat_init_rwsem); +/* Completion tracking for deferred_init_memmap() threads */ +static atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); + +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) @@ -1079,7 +1084,7 @@ static int __init deferred_init_memmap(void *data) const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (first_init_pfn == ULONG_MAX) { - up_read(&pgdat_init_rwsem); + pgdat_init_report_one_done(); return 0; } @@ -1179,7 +1184,8 @@ free_range: pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, jiffies_to_msecs(jiffies - start)); - up_read(&pgdat_init_rwsem); + + pgdat_init_report_one_done(); return 0; } @@ -1187,14 +1193,17 @@ void __init page_alloc_init_late(void) { int nid; + /* There will be num_node_state(N_MEMORY) threads */ + atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); for_each_node_state(nid, N_MEMORY) { - down_read(&pgdat_init_rwsem); kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); } /* Block until all are initialised */ - down_write(&pgdat_init_rwsem); - up_write(&pgdat_init_rwsem); + wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -1287,6 +1296,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; @@ -1330,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -1950,6 +1966,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; + gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -1963,10 +1980,11 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - set_page_owner(page, 0, 0); + gfp_mask = get_page_owner_gfp(page); + set_page_owner(page, 0, gfp_mask); for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); - set_page_owner(page + i, 0, 0); + set_page_owner(page + i, 0, gfp_mask); } } EXPORT_SYMBOL_GPL(split_page); @@ -1996,6 +2014,8 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); + set_page_owner(page, order, __GFP_MOVABLE); + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; @@ -2007,7 +2027,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } } - set_page_owner(page, order, 0); + return 1UL << order; } @@ -3328,7 +3348,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } @@ -5043,6 +5063,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, { unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node, the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; @@ -5106,6 +5130,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node, the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); diff --git a/mm/page_owner.c b/mm/page_owner.c index bd5f842b56d2..983c3a10fa07 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +gfp_t __get_page_owner_gfp(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + return page_ext->gfp_mask; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) diff --git a/mm/shmem.c b/mm/shmem.c index 4caf8ed24d65..dbe0c1e8349c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3363,8 +3363,8 @@ put_path: * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a - * higher layer. The one user is the big_key implementation. LSM checks - * are provided at the key level rather than the inode level. + * higher layer. The users are the big_key and shm implementations. LSM + * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size diff --git a/mm/slab.c b/mm/slab.c index 200e22412a16..bbd0b47dc6a9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, } /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (unlikely(page->pfmemalloc)) + if (page_is_pfmemalloc(page)) pfmemalloc_active = true; nr_pages = (1 << cachep->gfporder); @@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 3e5f8f29c286..86831105a09f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) /* * Merge control. If this is set then no merging of slab caches will occur. diff --git a/mm/slub.c b/mm/slub.c index 816df0016555..f68c0e50f3c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index e61445dce04e..8286938c70de 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -973,22 +973,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, * caller can stall after page list has been processed. * * 2) Global or new memcg reclaim encounters a page that is - * not marked for immediate reclaim or the caller does not - * have __GFP_IO. In this case mark the page for immediate + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * - * __GFP_IO is checked because a loop driver thread might + * Require may_enter_fs because we would wait on fs, which + * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * Don't require __GFP_FS, since we're not going into the - * FS, just waiting on its writeback completion. Worryingly, - * ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing - * may_enter_fs here is liable to OOM on them. - * * 3) Legacy memcg encounters a page that is not already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many @@ -1005,7 +1001,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 2 above */ } else if (sane_reclaim(sc) || - !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then |