summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug17
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/cma.c14
-rw-r--r--mm/cma_debug.c11
-rw-r--r--mm/compaction.c1039
-rw-r--r--mm/debug.c9
-rw-r--r--mm/dmapool.c13
-rw-r--r--mm/failslab.c14
-rw-r--r--mm/filemap.c309
-rw-r--r--mm/gup.c209
-rw-r--r--mm/gup_benchmark.c8
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c37
-rw-r--r--mm/hugetlb.c117
-rw-r--r--mm/internal.h24
-rw-r--r--mm/kasan/Makefile3
-rw-r--r--mm/kasan/common.c84
-rw-r--r--mm/kasan/generic.c19
-rw-r--r--mm/kasan/generic_report.c3
-rw-r--r--mm/kasan/init.c16
-rw-r--r--mm/kasan/kasan.h3
-rw-r--r--mm/kasan/tags.c2
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/kmemleak.c10
-rw-r--r--mm/ksm.c77
-rw-r--r--mm/list_lru.c3
-rw-r--r--mm/maccess.c6
-rw-r--r--mm/memblock.c385
-rw-r--r--mm/memcontrol.c150
-rw-r--r--mm/memfd.c3
-rw-r--r--mm/memory-failure.c33
-rw-r--r--mm/memory.c98
-rw-r--r--mm/memory_hotplug.c178
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/mempool.c8
-rw-r--r--mm/migrate.c50
-rw-r--r--mm/mincore.c94
-rw-r--r--mm/mlock.c14
-rw-r--r--mm/mmap.c22
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/mremap.c17
-rw-r--r--mm/oom_kill.c77
-rw-r--r--mm/page-writeback.c24
-rw-r--r--mm/page_alloc.c210
-rw-r--r--mm/page_ext.c9
-rw-r--r--mm/page_idle.c8
-rw-r--r--mm/page_owner.c8
-rw-r--r--mm/page_poison.c4
-rw-r--r--mm/percpu-km.c2
-rw-r--r--mm/percpu.c86
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c10
-rw-r--r--mm/shmem.c753
-rw-r--r--mm/slab.c55
-rw-r--r--mm/slab.h11
-rw-r--r--mm/slab_common.c15
-rw-r--r--mm/slub.c77
-rw-r--r--mm/sparse.c29
-rw-r--r--mm/swap.c33
-rw-r--r--mm/swap_state.c23
-rw-r--r--mm/swapfile.c487
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/usercopy.c9
-rw-r--r--mm/userfaultfd.c11
-rw-r--r--mm/util.c41
-rw-r--r--mm/vmalloc.c459
-rw-r--r--mm/vmscan.c98
-rw-r--r--mm/vmstat.c15
-rw-r--r--mm/workingset.c5
69 files changed, 3402 insertions, 2285 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 9a7b8b049d04..e3df921208c0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config PAGE_OWNER
+ bool "Track page owner"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+ select DEBUG_FS
+ select STACKTRACE
+ select STACKDEPOT
+ select PAGE_EXTENSION
+ help
+ This keeps track of what call chain is the owner of a page, may
+ help to find bare alloc_page(s) leaks. Even if you include this
+ feature on your build, it is disabled in default. You should pass
+ "page_owner=on" to boot parameter in order to enable it. Eats
+ a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ for user-space helper.
+
+ If unsure, say N.
+
config PAGE_POISONING
bool "Poison pages after freeing"
select PAGE_POISONING_NO_SANITY if HIBERNATION
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb8796c6c..72e6d0c55cfa 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
+ init_rwsem(&bdi->wb_switch_rwsem);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
diff --git a/mm/cma.c b/mm/cma.c
index c7b39dd3b4f6..bb2d333ffcb3 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -327,16 +327,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
* memory in case of failure.
*/
if (base < highmem_start && limit > highmem_start) {
- addr = memblock_alloc_range(size, alignment,
- highmem_start, limit,
- MEMBLOCK_NONE);
+ addr = memblock_phys_alloc_range(size, alignment,
+ highmem_start, limit);
limit = highmem_start;
}
if (!addr) {
- addr = memblock_alloc_range(size, alignment, base,
- limit,
- MEMBLOCK_NONE);
+ addr = memblock_phys_alloc_range(size, alignment, base,
+ limit);
if (!addr) {
ret = -ENOMEM;
goto err;
@@ -353,12 +351,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
- goto err;
+ goto free_mem;
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
+free_mem:
+ memblock_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index ad6723e9d110..8d7b2fd52225 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -21,8 +21,6 @@ struct cma_mem {
unsigned long n;
};
-static struct dentry *cma_debugfs_root;
-
static int cma_debugfs_get(void *data, u64 *val)
{
unsigned long *p = data;
@@ -162,7 +160,7 @@ static int cma_alloc_write(void *data, u64 val)
}
DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
-static void cma_debugfs_add_one(struct cma *cma, int idx)
+static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
char name[16];
@@ -170,7 +168,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
scnprintf(name, sizeof(name), "cma-%s", cma->name);
- tmp = debugfs_create_dir(name, cma_debugfs_root);
+ tmp = debugfs_create_dir(name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
@@ -188,14 +186,13 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
static int __init cma_debugfs_init(void)
{
+ struct dentry *cma_debugfs_root;
int i;
cma_debugfs_root = debugfs_create_dir("cma", NULL);
- if (!cma_debugfs_root)
- return -ENOMEM;
for (i = 0; i < cma_area_count; i++)
- cma_debugfs_add_one(&cma_areas[i], i);
+ cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
return 0;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index ef29490b0f46..f171a83707ce 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist)
return high_pfn;
}
-static void map_pages(struct list_head *list)
+static void split_map_pages(struct list_head *list)
{
unsigned int i, order, nr_pages;
struct page *page, *next;
@@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page)
return false;
}
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+ bool check_target)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct page *end_page;
+ unsigned long block_pfn;
+
+ if (!page)
+ return false;
+ if (zone != page_zone(page))
+ return false;
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ /*
+ * If skip is already cleared do no further checking once the
+ * restart points have been set.
+ */
+ if (check_source && check_target && !get_pageblock_skip(page))
+ return true;
+
+ /*
+ * If clearing skip for the target scanner, do not select a
+ * non-movable pageblock as the starting point.
+ */
+ if (!check_source && check_target &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ return false;
+
+ /*
+ * Only clear the hint if a sample indicates there is either a
+ * free page or an LRU page in the block. One or other condition
+ * is necessary for the block to be a migration source/target.
+ */
+ block_pfn = pageblock_start_pfn(pfn);
+ pfn = max(block_pfn, zone->zone_start_pfn);
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ return false;
+ pfn = block_pfn + pageblock_nr_pages;
+ pfn = min(pfn, zone_end_pfn(zone));
+ end_page = pfn_to_page(pfn);
+
+ do {
+ if (pfn_valid_within(pfn)) {
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+ }
+
+ page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ } while (page < end_page);
+
+ return false;
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page)
*/
static void __reset_isolation_suitable(struct zone *zone)
{
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- unsigned long pfn;
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone);
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
zone->compact_blockskip_flush = false;
- /* Walk the zone and mark every pageblock as suitable for isolation */
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- struct page *page;
-
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
cond_resched();
- page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- if (zone != page_zone(page))
- continue;
- if (pageblock_skip_persistent(page))
- continue;
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
- clear_pageblock_skip(page);
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
}
- reset_cached_positions(zone);
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -286,15 +374,53 @@ void reset_isolation_suitable(pg_data_t *pgdat)
}
/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * locks are not required for read/writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ bool skip;
+
+ /* Do no update if skip hint is being ignored */
+ if (cc->ignore_skip_hint)
+ return false;
+
+ if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ return false;
+
+ skip = get_pageblock_skip(page);
+ if (!skip && !cc->no_set_skip_hint)
+ set_pageblock_skip(page);
+
+ return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Set for isolation rather than compaction */
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
+/*
* If no pages were isolated then mark this pageblock to be skipped in the
* future. The information is later cleared by __reset_isolation_suitable().
*/
static void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
struct zone *zone = cc->zone;
- unsigned long pfn;
if (cc->no_set_skip_hint)
return;
@@ -302,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc,
if (!page)
return;
- if (nr_isolated)
- return;
-
set_pageblock_skip(page);
- pfn = page_to_pfn(page);
-
/* Update where async and sync compaction should restart */
- if (migrate_scanner) {
- if (pfn > zone->compact_cached_migrate_pfn[0])
- zone->compact_cached_migrate_pfn[0] = pfn;
- if (cc->mode != MIGRATE_ASYNC &&
- pfn > zone->compact_cached_migrate_pfn[1])
- zone->compact_cached_migrate_pfn[1] = pfn;
- } else {
- if (pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
@@ -334,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page)
}
static inline void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
+{
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
{
+ return false;
}
#endif /* CONFIG_COMPACTION */
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. For async compaction, back out if the lock cannot
- * be taken immediately. For sync compaction, spin on the lock if needed.
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
*
- * Returns true if the lock is held
- * Returns false if the lock is not held and compaction should abort
+ * Always returns true which makes it easier to track lock state in callers.
*/
-static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
{
- if (cc->mode == MIGRATE_ASYNC) {
- if (!spin_trylock_irqsave(lock, *flags)) {
- cc->contended = true;
- return false;
- }
- } else {
- spin_lock_irqsave(lock, *flags);
+ /* Track if the lock is contended in async mode */
+ if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+ if (spin_trylock_irqsave(lock, *flags))
+ return true;
+
+ cc->contended = true;
}
+ spin_lock_irqsave(lock, *flags);
return true;
}
@@ -391,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
return true;
}
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
- cond_resched();
- }
-
- return false;
-}
-
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_unlock_should_abort() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
-{
- /* async compaction aborts if contended */
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
-
- cond_resched();
- }
+ cond_resched();
return false;
}
@@ -435,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
+ unsigned int stride,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor, *valid_page = NULL;
+ struct page *cursor;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;
+ /* Strict mode is for isolation, speed is secondary */
+ if (strict)
+ stride = 1;
+
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
int isolated;
struct page *page = cursor;
@@ -465,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
- if (!valid_page)
- valid_page = page;
-
/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
@@ -495,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* recheck as well.
*/
if (!locked) {
- /*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
- */
- locked = compact_trylock_irqsave(&cc->zone->lock,
+ locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
- if (!locked)
- break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -565,10 +650,6 @@ isolate_fail:
if (strict && blockpfn < end_pfn)
total_isolated = 0;
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
-
cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
@@ -626,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc,
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, &freelist, true);
+ block_end_pfn, &freelist, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -644,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc,
}
/* __isolate_free_page() does not map the pages */
- map_pages(&freelist);
+ split_map_pages(&freelist);
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
@@ -657,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc,
}
/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(struct zone *zone)
+static bool too_many_isolated(pg_data_t *pgdat)
{
unsigned long active, inactive, isolated;
- inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
- node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
- active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
- node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
- isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
- node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_ANON);
+ active = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_ACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
+ node_page_state(pgdat, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2;
}
@@ -693,7 +774,7 @@ static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
- struct zone *zone = cc->zone;
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
@@ -702,13 +783,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
+ bool skip_updated = false;
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(zone))) {
+ while (unlikely(too_many_isolated(pgdat))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;
@@ -719,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
return 0;
}
- if (compact_should_abort(cc))
- return 0;
+ cond_resched();
if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
@@ -758,8 +839,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(zone_lru_lock(zone), flags,
- &locked, cc))
+ && compact_unlock_should_abort(&pgdat->lru_lock,
+ flags, &locked, cc))
break;
if (!pfn_valid_within(low_pfn))
@@ -768,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page = pfn_to_page(low_pfn);
- if (!valid_page)
+ /*
+ * Check if the pageblock has already been marked skipped.
+ * Only the aligned PFN is checked as the caller isolates
+ * COMPACT_CLUSTER_MAX at a time so the second call must
+ * not falsely conclude that the block should be skipped.
+ */
+ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ low_pfn = end_pfn;
+ goto isolate_abort;
+ }
valid_page = page;
+ }
/*
* Skip if free. We read page order here without zone lock
@@ -818,7 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(zone_lru_lock(zone),
+ spin_unlock_irqrestore(&pgdat->lru_lock,
flags);
locked = false;
}
@@ -848,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(zone_lru_lock(zone),
+ locked = compact_lock_irqsave(&pgdat->lru_lock,
&flags, cc);
- if (!locked)
- break;
+
+ /* Try get exclusive access under lock */
+ if (!skip_updated) {
+ skip_updated = true;
+ if (test_and_set_skip(cc, page, low_pfn))
+ goto isolate_abort;
+ }
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
@@ -868,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
}
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -887,16 +984,13 @@ isolate_success:
nr_isolated++;
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that was isolated and likely be
- * then freed by migration.
+ * Avoid isolating too much unless this block is being
+ * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * or a lock is contended. For contention, isolate quickly to
+ * potentially remove one source of contention.
*/
- if (!cc->last_migrated_pfn)
- cc->last_migrated_pfn = low_pfn;
-
- /* Avoid isolating too much */
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ !cc->rescan && !cc->contended) {
++low_pfn;
break;
}
@@ -913,12 +1007,11 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
locked = false;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- cc->last_migrated_pfn = 0;
nr_isolated = 0;
}
@@ -939,15 +1032,23 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+isolate_abort:
if (locked)
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
/*
- * Update the pageblock-skip information and cached scanner pfn,
- * if the whole pageblock was scanned without isolating any page.
+ * Updated the cached scanner pfn once the pageblock has been scanned
+ * Pages will either be migrated in which case there is no point
+ * scanning in the near future or migration failed in which case the
+ * failure reason may persist. The block is marked for skipping if
+ * there were no pages isolated in the block or if the block is
+ * rescanned twice in a row.
*/
- if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (valid_page && !skip_updated)
+ set_pageblock_skip(valid_page);
+ update_cached_migrate(cc, low_pfn);
+ }
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
@@ -1013,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc,
{
int block_mt;
+ if (pageblock_skip_persistent(page))
+ return false;
+
if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
return true;
@@ -1050,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc,
return false;
}
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+ return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+}
+
/*
* Test whether the free scanner has reached the same or lower pageblock than
* the migration scanner, and compaction should thus terminate.
@@ -1061,6 +1171,248 @@ static inline bool compact_scanners_met(struct compact_control *cc)
}
/*
+ * Used when scanning for a suitable migration target which scans freelists
+ * in reverse. Reorders the list such as the unscanned pages are scanned
+ * first on the next iteration of the free scanner
+ */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+/*
+ * Similar to move_freelist_head except used by the migration scanner
+ * when scanning forward. It's possible for these list operations to
+ * move against each other if they search the free list exactly in
+ * lockstep.
+ */
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_first(freelist, &freepage->lru)) {
+ list_cut_position(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+static void
+fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *page = pfn_to_page(pfn);
+
+ /* Do not search around if there are enough pages already */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+
+ /* Minimise scanning during async compaction */
+ if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
+ return;
+
+ /* Pageblock boundaries */
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+
+ /* Scan before */
+ if (start_pfn != pfn) {
+ isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+ }
+
+ /* Scan after */
+ start_pfn = pfn + nr_isolated;
+ if (start_pfn != end_pfn)
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+
+ /* Skip this pageblock in the future as it's full or nearly full */
+ if (cc->nr_freepages < cc->nr_migratepages)
+ set_pageblock_skip(page);
+}
+
+/* Search orders in round-robin fashion */
+static int next_search_order(struct compact_control *cc, int order)
+{
+ order--;
+ if (order < 0)
+ order = cc->order - 1;
+
+ /* Search wrapped around? */
+ if (order == cc->search_order) {
+ cc->search_order--;
+ if (cc->search_order < 0)
+ cc->search_order = cc->order - 1;
+ return -1;
+ }
+
+ return order;
+}
+
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+ unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int nr_scanned = 0;
+ unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long nr_isolated = 0;
+ unsigned long distance;
+ struct page *page = NULL;
+ bool scan_start = false;
+ int order;
+
+ /* Full compaction passes in a negative order */
+ if (cc->order <= 0)
+ return cc->free_pfn;
+
+ /*
+ * If starting the scan, use a deeper search and use the highest
+ * PFN found if a suitable one is not found.
+ */
+ if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+ limit = pageblock_nr_pages >> 1;
+ scan_start = true;
+ }
+
+ /*
+ * Preferred point is in the top quarter of the scan space but take
+ * a pfn from the top half if the search is problematic.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn);
+ low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+ min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+ if (WARN_ON_ONCE(min_pfn > low_pfn))
+ low_pfn = min_pfn;
+
+ /*
+ * Search starts from the last successful isolation order or the next
+ * order to search after a previous failure
+ */
+ cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+ for (order = cc->search_order;
+ !page && order >= 0;
+ order = next_search_order(cc, order)) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ struct page *freepage;
+ unsigned long flags;
+ unsigned int order_scanned = 0;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry_reverse(freepage, freelist, lru) {
+ unsigned long pfn;
+
+ order_scanned++;
+ nr_scanned++;
+ pfn = page_to_pfn(freepage);
+
+ if (pfn >= highest)
+ highest = pageblock_start_pfn(pfn);
+
+ if (pfn >= low_pfn) {
+ cc->fast_search_fail = 0;
+ cc->search_order = order;
+ page = freepage;
+ break;
+ }
+
+ if (pfn >= min_pfn && pfn > high_pfn) {
+ high_pfn = pfn;
+
+ /* Shorten the scan if a candidate is found */
+ limit >>= 1;
+ }
+
+ if (order_scanned >= limit)
+ break;
+ }
+
+ /* Use a minimum pfn if a preferred one was not found */
+ if (!page && high_pfn) {
+ page = pfn_to_page(high_pfn);
+
+ /* Update freepage for the list reorder below */
+ freepage = page;
+ }
+
+ /* Reorder to so a future search skips recent pages */
+ move_freelist_head(freelist, freepage);
+
+ /* Isolate the page if available */
+ if (page) {
+ if (__isolate_free_page(page, order)) {
+ set_page_private(page, order);
+ nr_isolated = 1 << order;
+ cc->nr_freepages += nr_isolated;
+ list_add_tail(&page->lru, &cc->freepages);
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+ } else {
+ /* If isolation fails, abort the search */
+ order = -1;
+ page = NULL;
+ }
+ }
+
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /*
+ * Smaller scan on next order so the total scan ig related
+ * to freelist_scan_limit.
+ */
+ if (order_scanned >= limit)
+ limit = min(1U, limit >> 1);
+ }
+
+ if (!page) {
+ cc->fast_search_fail++;
+ if (scan_start) {
+ /*
+ * Use the highest PFN found above min. If one was
+ * not found, be pessemistic for direct compaction
+ * and use the min mark.
+ */
+ if (highest) {
+ page = pfn_to_page(highest);
+ cc->free_pfn = highest;
+ } else {
+ if (cc->direct_compaction) {
+ page = pfn_to_page(min_pfn);
+ cc->free_pfn = min_pfn;
+ }
+ }
+ }
+ }
+
+ if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+ highest -= pageblock_nr_pages;
+ cc->zone->compact_cached_free_pfn = highest;
+ }
+
+ cc->total_free_scanned += nr_scanned;
+ if (!page)
+ return cc->free_pfn;
+
+ low_pfn = page_to_pfn(page);
+ fast_isolate_around(cc, low_pfn, nr_isolated);
+ return low_pfn;
+}
+
+/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
@@ -1073,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;
+ unsigned int stride;
+
+ /* Try a small search of the free lists for a candidate */
+ isolate_start_pfn = fast_isolate_freepages(cc);
+ if (cc->nr_freepages)
+ goto splitmap;
/*
* Initialise the free scanner. The starting point is where we last
@@ -1086,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc)
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = pageblock_start_pfn(cc->free_pfn);
+ block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+ stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
/*
* Isolate free pages until enough are available to migrate the
@@ -1100,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc)
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
+ unsigned long nr_isolated;
+
/*
* This can iterate a massively long zone without finding any
- * suitable migration targets, so periodically check if we need
- * to schedule, or even abort async compaction.
+ * suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
@@ -1123,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
- freelist, false);
+ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, freelist, stride, false);
- /*
- * If we isolated enough freepages, or aborted due to lock
- * contention, terminate.
- */
- if ((cc->nr_freepages >= cc->nr_migratepages)
- || cc->contended) {
+ /* Update the skip hint if the full pageblock was scanned */
+ if (isolate_start_pfn == block_end_pfn)
+ update_pageblock_skip(cc, page, block_start_pfn);
+
+ /* Are enough freepages isolated? */
+ if (cc->nr_freepages >= cc->nr_migratepages) {
if (isolate_start_pfn >= block_end_pfn) {
/*
* Restart at previous pageblock if more
@@ -1148,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc)
*/
break;
}
- }
- /* __isolate_free_page() does not map the pages */
- map_pages(freelist);
+ /* Adjust stride depending on isolation */
+ if (nr_isolated) {
+ stride = 1;
+ continue;
+ }
+ stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+ }
/*
* Record where the free scanner will restart next time. Either we
@@ -1160,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc)
* and the loop terminated due to isolate_start_pfn < low_pfn
*/
cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(freelist);
}
/*
@@ -1172,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage,
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /*
- * Isolate free pages if necessary, and if we are not aborting due to
- * contention.
- */
if (list_empty(&cc->freepages)) {
- if (!cc->contended)
- isolate_freepages(cc);
+ isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -1217,6 +1579,147 @@ typedef enum {
*/
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+ if (cc->fast_start_pfn == ULONG_MAX)
+ return;
+
+ if (!cc->fast_start_pfn)
+ cc->fast_start_pfn = pfn;
+
+ cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+
+static inline unsigned long
+reinit_migrate_pfn(struct compact_control *cc)
+{
+ if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+ return cc->migrate_pfn;
+
+ cc->migrate_pfn = cc->fast_start_pfn;
+ cc->fast_start_pfn = ULONG_MAX;
+
+ return cc->migrate_pfn;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+ unsigned int limit = freelist_scan_limit(cc);
+ unsigned int nr_scanned = 0;
+ unsigned long distance;
+ unsigned long pfn = cc->migrate_pfn;
+ unsigned long high_pfn;
+ int order;
+
+ /* Skip hints are relied on to avoid repeats on the fast search */
+ if (cc->ignore_skip_hint)
+ return pfn;
+
+ /*
+ * If the migrate_pfn is not at the start of a zone or the start
+ * of a pageblock then assume this is a continuation of a previous
+ * scan restarted due to COMPACT_CLUSTER_MAX.
+ */
+ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+ return pfn;
+
+ /*
+ * For smaller orders, just linearly scan as the number of pages
+ * to migrate should be relatively small and does not necessarily
+ * justify freeing up a large block for a small allocation.
+ */
+ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return pfn;
+
+ /*
+ * Only allow kcompactd and direct requests for movable pages to
+ * quickly clear out a MOVABLE pageblock for allocation. This
+ * reduces the risk that a large movable pageblock is freed for
+ * an unmovable/reclaimable small allocation.
+ */
+ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+ return pfn;
+
+ /*
+ * When starting the migration scanner, pick any pageblock within the
+ * first half of the search space. Otherwise try and pick a pageblock
+ * within the first eighth to reduce the chances that a migration
+ * target later becomes a source.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+ if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+ distance >>= 2;
+ high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+ for (order = cc->order - 1;
+ order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order--) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ unsigned long flags;
+ struct page *freepage;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry(freepage, freelist, lru) {
+ unsigned long free_pfn;
+
+ nr_scanned++;
+ free_pfn = page_to_pfn(freepage);
+ if (free_pfn < high_pfn) {
+ /*
+ * Avoid if skipped recently. Ideally it would
+ * move to the tail but even safe iteration of
+ * the list assumes an entry is deleted, not
+ * reordered.
+ */
+ if (get_pageblock_skip(freepage)) {
+ if (list_is_last(freelist, &freepage->lru))
+ break;
+
+ continue;
+ }
+
+ /* Reorder to so a future search skips recent pages */
+ move_freelist_tail(freelist, freepage);
+
+ update_fast_start_pfn(cc, free_pfn);
+ pfn = pageblock_start_pfn(free_pfn);
+ cc->fast_search_fail = 0;
+ set_pageblock_skip(freepage);
+ break;
+ }
+
+ if (nr_scanned >= limit) {
+ cc->fast_search_fail++;
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+
+ cc->total_migrate_scanned += nr_scanned;
+
+ /*
+ * If fast scanning failed then use a cached entry for a page block
+ * that had free pages as the basis for starting a linear scan.
+ */
+ if (pfn == cc->migrate_pfn)
+ pfn = reinit_migrate_pfn(cc);
+
+ return pfn;
+}
+
/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
@@ -1232,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+ bool fast_find_block;
/*
* Start at where we last stopped, or beginning of the zone as
- * initialized by compact_zone()
+ * initialized by compact_zone(). The first failure will use
+ * the lowest PFN as the starting point for linear scanning.
*/
- low_pfn = cc->migrate_pfn;
+ low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
+ /*
+ * fast_find_migrateblock marks a pageblock skipped so to avoid
+ * the isolation_suitable check below, check whether the fast
+ * search was successful.
+ */
+ fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
+
/* Only scan within a pageblock boundary */
block_end_pfn = pageblock_end_pfn(low_pfn);
@@ -1250,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* Do not cross the free scanner.
*/
for (; block_end_pfn <= cc->free_pfn;
+ fast_find_block = false,
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1257,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
- * need to schedule, or even abort async compaction.
+ * need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;
- /* If isolation recently failed, do not retry */
- if (!isolation_suitable(cc, page))
+ /*
+ * If isolation recently failed, do not retry. Only check the
+ * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
+ * to be visited multiple times. Assume skip was checked
+ * before making it "skip" so other compaction instances do
+ * not scan the same block.
+ */
+ if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ !fast_find_block && !isolation_suitable(cc, page))
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks.
- * Async compaction is optimistic to see if the minimum amount
- * of work satisfies the allocation.
+ * For async compaction, also only scan in MOVABLE blocks
+ * without huge pages. Async compaction is optimistic to see
+ * if the minimum amount of work satisfies the allocation.
+ * The cached PFN is updated as it's possible that all
+ * remaining blocks between source and target are unsuitable
+ * and the compaction scanners fail to meet.
*/
- if (!suitable_migration_source(cc, page))
+ if (!suitable_migration_source(cc, page)) {
+ update_cached_migrate(cc, block_end_pfn);
continue;
+ }
/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn)
return ISOLATE_ABORT;
/*
@@ -1310,19 +1834,16 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
const int migratetype = cc->migratetype;
-
- if (cc->contended || fatal_signal_pending(current))
- return COMPACT_CONTENDED;
+ int ret;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- reset_cached_positions(zone);
+ reset_cached_positions(cc->zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1331,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone,
* based on an allocation request.
*/
if (cc->direct_compaction)
- zone->compact_blockskip_flush = true;
+ cc->zone->compact_blockskip_flush = true;
if (cc->whole_zone)
return COMPACT_COMPLETE;
@@ -1342,20 +1863,19 @@ static enum compact_result __compact_finished(struct zone *zone,
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- if (cc->finishing_block) {
- /*
- * We have finished the pageblock, but better check again that
- * we really succeeded.
- */
- if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
- cc->finishing_block = false;
- else
- return COMPACT_CONTINUE;
- }
+ /*
+ * Always finish scanning a pageblock to reduce the possibility of
+ * fallbacks in the future. This is particularly important when
+ * migration source is unmovable/reclaimable but it's not worth
+ * special casing.
+ */
+ if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
+ ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
+ struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
/* Job done if page is free of the right migratetype */
@@ -1393,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone,
return COMPACT_SUCCESS;
}
- cc->finishing_block = true;
- return COMPACT_CONTINUE;
+ ret = COMPACT_CONTINUE;
+ break;
}
}
- return COMPACT_NO_SUITABLE_PAGE;
+ if (cc->contended || fatal_signal_pending(current))
+ ret = COMPACT_CONTENDED;
+
+ return ret;
}
-static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result compact_finished(struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc);
- trace_mm_compaction_finished(zone, cc->order, ret);
+ ret = __compact_finished(cc);
+ trace_mm_compaction_finished(cc->zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1534,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
return false;
}
-static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
{
enum compact_result ret;
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long start_pfn = cc->zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(cc->zone);
+ unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
+ bool update_cached;
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
- ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
+ ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
@@ -1555,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
* Clear pageblock skip if there were failures recently and compaction
* is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order))
- __reset_isolation_suitable(zone);
+ if (compaction_restarting(cc->zone, cc->order))
+ __reset_isolation_suitable(cc->zone);
/*
* Setup to move all movable pages to the end of the zone. Used cached
@@ -1564,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
* want to compact the whole zone), but check that it is initialised
* by ensuring the values are within zone boundaries.
*/
+ cc->fast_start_pfn = 0;
if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
} else {
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
- cc->free_pfn = zone->compact_cached_free_pfn;
+ cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = cc->zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
- zone->compact_cached_free_pfn = cc->free_pfn;
+ cc->zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- if (cc->migrate_pfn == start_pfn)
+ if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
cc->whole_zone = true;
}
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
+
+ /*
+ * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+ * the basis that some migrations will fail in ASYNC mode. However,
+ * if the cached PFNs match and pageblocks are skipped due to having
+ * no isolation candidates, then the sync state does not matter.
+ * Until a pageblock with isolation candidates is found, keep the
+ * cached PFNs in sync to avoid revisiting the same blocks.
+ */
+ update_cached = !sync &&
+ cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
migrate_prep_local();
- while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
+ unsigned long start_pfn = cc->migrate_pfn;
+
+ /*
+ * Avoid multiple rescans which can happen if a page cannot be
+ * isolated (dirty/writeback in async mode) or if the migrated
+ * pages are being allocated before the pageblock is cleared.
+ * The first rescan will capture the entire pageblock for
+ * migration. If it fails, it'll be marked skip and scanning
+ * will proceed as normal.
+ */
+ cc->rescan = false;
+ if (pageblock_start_pfn(last_migrated_pfn) ==
+ pageblock_start_pfn(start_pfn)) {
+ cc->rescan = true;
+ }
- switch (isolate_migratepages(zone, cc)) {
+ switch (isolate_migratepages(cc->zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ last_migrated_pfn = 0;
goto out;
case ISOLATE_NONE:
+ if (update_cached) {
+ cc->zone->compact_cached_migrate_pfn[1] =
+ cc->zone->compact_cached_migrate_pfn[0];
+ }
+
/*
* We haven't isolated and migrated anything, but
* there might still be unflushed migrations from
@@ -1608,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
*/
goto check_drain;
case ISOLATE_SUCCESS:
+ update_cached = false;
+ last_migrated_pfn = start_pfn;
;
}
@@ -1639,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
/* Draining pcplists is useless in this case */
- cc->last_migrated_pfn = 0;
-
+ last_migrated_pfn = 0;
}
}
@@ -1652,21 +2211,26 @@ check_drain:
* compact_finished() can detect immediately if allocation
* would succeed.
*/
- if (cc->order > 0 && cc->last_migrated_pfn) {
+ if (cc->order > 0 && last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
- if (cc->last_migrated_pfn < current_block_start) {
+ if (last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
- drain_local_pages(zone);
+ drain_local_pages(cc->zone);
put_cpu();
/* No more flushing until we migrate again */
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
}
}
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
}
out:
@@ -1685,8 +2249,8 @@ out:
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
*/
- if (free_pfn > zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = free_pfn;
+ if (free_pfn > cc->zone->compact_cached_free_pfn)
+ cc->zone->compact_cached_free_pfn = free_pfn;
}
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
@@ -1700,7 +2264,8 @@ out:
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx,
+ struct page **capture)
{
enum compact_result ret;
struct compact_control cc = {
@@ -1709,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.total_migrate_scanned = 0,
.total_free_scanned = 0,
.order = order,
+ .search_order = order,
.gfp_mask = gfp_mask,
.zone = zone,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
@@ -1720,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
+ struct capture_control capc = {
+ .cc = &cc,
+ .page = NULL,
+ };
+
+ if (capture)
+ current->capture_control = &capc;
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- ret = compact_zone(zone, &cc);
+ ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
+ *capture = capc.page;
+ current->capture_control = NULL;
+
return ret;
}
@@ -1745,7 +2321,7 @@ int sysctl_extfrag_threshold = 500;
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum compact_priority prio)
+ enum compact_priority prio, struct page **capture)
{
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
@@ -1773,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac));
+ alloc_flags, ac_classzone_idx(ac), capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -1841,7 +2417,7 @@ static void compact_node(int nid)
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- compact_zone(zone, &cc);
+ compact_zone(&cc, NULL);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
@@ -1876,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
return 0;
}
-int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- proc_dointvec_minmax(table, write, buffer, length, ppos);
-
- return 0;
-}
-
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
@@ -1948,6 +2516,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
+ .search_order = pgdat->kcompactd_max_order,
.total_migrate_scanned = 0,
.total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
@@ -1983,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (kthread_should_stop())
return;
- status = compact_zone(zone, &cc);
+ status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
compaction_defer_reset(zone, cc.order, false);
diff --git a/mm/debug.c b/mm/debug.c
index 0abb987dad9b..c0b31b6c3877 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
int mapcount;
@@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason)
goto hex_only;
}
+ mapping = page_mapping(page);
+
/*
* Avoid VM_BUG_ON() in page_mapcount().
* page->_mapcount space in struct page is used by sl[aou]b pages to
@@ -135,7 +137,7 @@ void dump_mm(const struct mm_struct *mm)
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
- "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
+ "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -166,7 +168,8 @@ void dump_mm(const struct mm_struct *mm)
mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
- mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
+ atomic64_read(&mm->pinned_vm),
+ mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 6d4b97e7e9e9..76a160083506 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL);
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
- * Context: !in_interrupt()
+ * Context: not in_interrupt()
*
- * Returns a dma allocation pool with the requested characteristics, or
- * null if one can't be created. Given one of these pools, dma_pool_alloc()
+ * Given one of these pools, dma_pool_alloc()
* may be used to allocate memory. Such memory will all have "consistent"
* DMA mappings, accessible by the device and its driver without using
* cache flushing primitives. The actual size of blocks allocated may be
@@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL);
* cross that size boundary. This is useful for devices which have
* addressing restrictions on individual DMA transfers, such as not crossing
* boundaries of 4KBytes.
+ *
+ * Return: a dma allocation pool with the requested characteristics, or
+ * %NULL if one can't be created.
*/
struct dma_pool *dma_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t boundary)
@@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy);
* @mem_flags: GFP_* bitmask
* @handle: pointer to dma address of block
*
- * This returns the kernel virtual address of a currently unused block,
+ * Return: the kernel virtual address of a currently unused block,
* and reports its dma address through the handle.
* If such a memory block can't be allocated, %NULL is returned.
*/
@@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data)
*
* Managed dma_pool_create(). DMA pool created with this function is
* automatically destroyed on driver detach.
+ *
+ * Return: a managed dma allocation pool with the requested
+ * characteristics, or %NULL if one can't be created.
*/
struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t allocation)
diff --git a/mm/failslab.c b/mm/failslab.c
index b135ebb88b6f..ec5aad211c5b 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void)
if (IS_ERR(dir))
return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("cache-filter", mode, dir,
- &failslab.cache_filter))
- goto fail;
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_reclaim);
+ debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter);
return 0;
-fail:
- debugfs_remove_recursive(dir);
-
- return -ENOMEM;
}
late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f5e323e883e..d78f577baef2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -98,8 +98,8 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
- * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
+ * ->pgdat->lru_lock (follow_page->mark_page_accessed)
+ * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@@ -392,6 +392,8 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
* opposed to a regular memory cleansing writeback. The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
@@ -438,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
*
* This is a mostly non-blocking flush. Not suitable for data-integrity
* purposes - I/O may not be started against all dirty pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int filemap_flush(struct address_space *mapping)
{
@@ -453,6 +457,9 @@ EXPORT_SYMBOL(filemap_flush);
*
* Find at least one page in the range supplied, usually used to check if
* direct writing in this range will trigger a writeback.
+ *
+ * Return: %true if at least one page exists in the specified range,
+ * %false otherwise.
*/
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
@@ -529,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
* Since the error status of the address space is cleared by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
+ *
+ * Return: error status of the address space.
*/
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
loff_t end_byte)
@@ -551,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
* Since the error status of the file is advanced by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
+ *
+ * Return: error status of the address space vs. the file->f_wb_err cursor.
*/
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
@@ -572,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range);
* Use this function if callers don't handle errors themselves. Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
+ *
+ * Return: error status of the address space.
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
@@ -623,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait);
*
* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * Return: error status of the address space.
*/
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
@@ -678,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err);
* While we handle mapping->wb_err with atomic operations, the f_wb_err
* value is protected by the f_lock since we must ensure that it reflects
* the latest value swapped in for this file descriptor.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int file_check_and_advance_wb_err(struct file *file)
{
@@ -720,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err);
*
* After writing out and waiting on the data, we check and advance the
* f_wb_err cursor to the latest value, and return any errors detected there.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
@@ -753,6 +772,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
+ *
+ * Return: %0
*/
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
@@ -867,6 +888,8 @@ error:
*
* This function is used to add a page to the pagecache. It must be locked.
* This function does not add the page to the LRU. The caller must do that.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
@@ -1463,7 +1486,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Otherwise, %NULL is returned.
+ * Return: the found page or shadow entry, %NULL if nothing is found.
*/
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
@@ -1521,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry);
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Otherwise, %NULL is returned.
- *
* find_lock_entry() may sleep.
+ *
+ * Return: the found page or shadow entry, %NULL if nothing is found.
*/
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
@@ -1563,12 +1586,17 @@ EXPORT_SYMBOL(find_lock_entry);
* - FGP_CREAT: If page is not present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU
* list. The page is returned locked and with an increased
- * refcount. Otherwise, NULL is returned.
+ * refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ * its own locking dance if the page is already in cache, or unlock the page
+ * before returning if we had to add the page to pagecache.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
+ *
+ * Return: the found page or %NULL otherwise.
*/
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t gfp_mask)
@@ -1616,7 +1644,7 @@ no_page:
if (!page)
return NULL;
- if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
/* Init accessed so avoid atomic mark_page_accessed later */
@@ -1630,6 +1658,13 @@ no_page:
if (err == -EEXIST)
goto repeat;
}
+
+ /*
+ * add_to_page_cache_lru locks the page, and for mmap we expect
+ * an unlocked page.
+ */
+ if (page && (fgp_flags & FGP_FOR_MMAP))
+ unlock_page(page);
}
return page;
@@ -1656,8 +1691,7 @@ EXPORT_SYMBOL(pagecache_get_page);
* Any shadow entries of evicted pages, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
- * find_get_entries() returns the number of pages and shadow entries
- * which were found.
+ * Return: the number of pages and shadow entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
@@ -1727,8 +1761,8 @@ retry:
* indexes. There may be holes in the indices due to not-present pages.
* We also update @start to index the next page for the traversal.
*
- * find_get_pages_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * Return: the number of pages which were found. If this number is
+ * smaller than @nr_pages, the end of specified range has been
* reached.
*/
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
@@ -1765,7 +1799,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pages[ret] = page;
if (++ret == nr_pages) {
- *start = page->index + 1;
+ *start = xas.xa_index + 1;
goto out;
}
continue;
@@ -1801,7 +1835,7 @@ out:
* find_get_pages_contig() works exactly like find_get_pages(), except
* that the returned number of pages are guaranteed to be contiguous.
*
- * find_get_pages_contig() returns the number of pages which were found.
+ * Return: the number of pages which were found.
*/
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
@@ -1837,16 +1871,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- /*
- * must check mapping and index after taking the ref.
- * otherwise we can get both false positives and false
- * negatives, which is just confusing to the caller.
- */
- if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
- put_page(page);
- break;
- }
-
pages[ret] = page;
if (++ret == nr_pages)
break;
@@ -1872,6 +1896,8 @@ EXPORT_SYMBOL(find_get_pages_contig);
*
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
+ *
+ * Return: the number of pages which were found.
*/
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
@@ -1911,7 +1937,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pages[ret] = page;
if (++ret == nr_pages) {
- *index = page->index + 1;
+ *index = xas.xa_index + 1;
goto out;
}
continue;
@@ -1949,6 +1975,8 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
*
* Like find_get_entries, except we only return entries which are tagged with
* @tag.
+ *
+ * Return: the number of entries which were found.
*/
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
xa_mark_t tag, unsigned int nr_entries,
@@ -2034,6 +2062,10 @@ static void shrink_readahead_size_eio(struct file *filp,
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
+ *
+ * Return:
+ * * total number of bytes copied, including those the were already @written
+ * * negative error code if nothing was copied
*/
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
@@ -2295,6 +2327,9 @@ out:
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -2354,62 +2389,98 @@ out:
EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file: file to read
- * @offset: page index
- * @gfp_mask: memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
{
- struct address_space *mapping = file->f_mapping;
- struct page *page;
- int ret;
+ int flags = vmf->flags;
- do {
- page = __page_cache_alloc(gfp_mask);
- if (!page)
- return -ENOMEM;
+ if (fpin)
+ return fpin;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
- if (ret == 0)
- ret = mapping->a_ops->readpage(file, page);
- else if (ret == -EEXIST)
- ret = 0; /* losing race to add is OK */
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
+}
- put_page(page);
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+ struct file **fpin)
+{
+ if (trylock_page(page))
+ return 1;
- } while (ret == AOP_TRUNCATED_PAGE);
+ /*
+ * NOTE! This will make us return with VM_FAULT_RETRY, but with
+ * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+ * is supposed to work. We have way too many special cases..
+ */
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
- return ret;
+ *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+ if (vmf->flags & FAULT_FLAG_KILLABLE) {
+ if (__lock_page_killable(page)) {
+ /*
+ * We didn't have the right flags to drop the mmap_sem,
+ * but all fault_handlers only check for fatal signals
+ * if we return VM_FAULT_RETRY, so we need to drop the
+ * mmap_sem here and return 0 if we don't have a fpin.
+ */
+ if (*fpin == NULL)
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
}
-#define MMAP_LOTSAMISS (100)
/*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
*/
-static void do_sync_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- pgoff_t offset)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (!ra->ra_pages)
- return;
+ return fpin;
- if (vma->vm_flags & VM_SEQ_READ) {
+ if (vmf->vma->vm_flags & VM_SEQ_READ) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
- return;
+ return fpin;
}
/* Avoid banging the cache line if not needed */
@@ -2421,37 +2492,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
* stop bothering with read-ahead. It will only hurt.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
- return;
+ return fpin;
/*
* mmap read-around
*/
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra_submit(ra, mapping, file);
+ return fpin;
}
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
*/
-static void do_async_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- struct page *page,
- pgoff_t offset)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+ struct page *page)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
- if (PageReadahead(page))
+ if (PageReadahead(page)) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
+ }
+ return fpin;
}
/**
@@ -2476,11 +2554,14 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes.
*/
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
+ struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
@@ -2502,23 +2583,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
- do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
+ fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
/* No page in the page cache at all */
- do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
+ fpin = do_sync_mmap_readahead(vmf);
retry_find:
- page = find_get_page(mapping, offset);
- if (!page)
- goto no_cached_page;
+ page = pagecache_get_page(mapping, offset,
+ FGP_CREAT|FGP_FOR_MMAP,
+ vmf->gfp_mask);
+ if (!page) {
+ if (fpin)
+ goto out_retry;
+ return vmf_error(-ENOMEM);
+ }
}
- if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
- put_page(page);
- return ret | VM_FAULT_RETRY;
- }
+ if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ goto out_retry;
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
@@ -2536,6 +2620,16 @@ retry_find:
goto page_not_uptodate;
/*
+ * We've made it this far and we had to drop our mmap_sem, now is the
+ * time to return to the upper layer and have it re-find the vma and
+ * redo the fault.
+ */
+ if (fpin) {
+ unlock_page(page);
+ goto out_retry;
+ }
+
+ /*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
@@ -2549,28 +2643,6 @@ retry_find:
vmf->page = page;
return ret | VM_FAULT_LOCKED;
-no_cached_page:
- /*
- * We're only likely to ever get here if MADV_RANDOM is in
- * effect.
- */
- error = page_cache_read(file, offset, vmf->gfp_mask);
-
- /*
- * The page we want has now been added to the page cache.
- * In the unlikely event that someone removed it in the
- * meantime, we'll just come back here and read it again.
- */
- if (error >= 0)
- goto retry_find;
-
- /*
- * An error return from page_cache_read can result if the
- * system is low on memory, or a problem occurs while trying
- * to schedule I/O.
- */
- return vmf_error(error);
-
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -2579,12 +2651,15 @@ page_not_uptodate:
* and we need to check for errors.
*/
ClearPageError(page);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
+ if (fpin)
+ goto out_retry;
put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2593,6 +2668,18 @@ page_not_uptodate:
/* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
return VM_FAULT_SIGBUS;
+
+out_retry:
+ /*
+ * We dropped the mmap_sem, we need to return to the fault handler to
+ * re-find the vma and come back and find our hopefully still populated
+ * page.
+ */
+ if (page)
+ put_page(page);
+ if (fpin)
+ fput(fpin);
+ return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
@@ -2861,6 +2948,8 @@ out:
* not set, try to fill the page and wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
@@ -2881,6 +2970,8 @@ EXPORT_SYMBOL(read_cache_page);
* any new page allocations done using the specified allocation flags.
*
* If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
@@ -3081,7 +3172,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
- pos + write_len))
+ pos + write_len - 1))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
@@ -3264,6 +3355,10 @@ EXPORT_SYMBOL(generic_perform_write);
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
*/
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3348,6 +3443,10 @@ EXPORT_SYMBOL(__generic_file_write_iter);
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
+ * Return:
+ * * negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * number of bytes written, even for truncated writes
*/
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3374,8 +3473,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
- * (presumably at page->private). If the release was successful, return '1'.
- * Otherwise return zero.
+ * (presumably at page->private).
*
* This may also be called if PG_fscache is set on a page, indicating that the
* page is known to the local caching routines.
@@ -3383,6 +3481,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* The @gfp_mask argument specifies whether I/O may be performed to release
* this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
*
+ * Return: %1 if the release was successful, otherwise return zero.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
diff --git a/mm/gup.c b/mm/gup.c
index 05acd7e2eb22..f84e22685aaa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -13,6 +13,9 @@
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
@@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
+#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
+
#ifdef CONFIG_FS_DAX
+static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+ long i;
+ struct vm_area_struct *vma_prev = NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+ return false;
+}
+#endif
+
+#ifdef CONFIG_CMA
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+ /*
+ * We want to make sure we allocate the new page from the same node
+ * as the source page.
+ */
+ int nid = page_to_nid(page);
+ /*
+ * Trying to allocate a page for migration. Ignore allocation
+ * failure warnings. We don't force __GFP_THISNODE here because
+ * this node here is the node where we have CMA reservation and
+ * in some case these nodes will have really less non movable
+ * allocation memory.
+ */
+ gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(page);
+ /*
+ * We don't want to dequeue from the pool because pool pages will
+ * mostly be from the CMA region.
+ */
+ return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+ }
+#endif
+ if (PageTransHuge(page)) {
+ struct page *thp;
+ /*
+ * ignore allocation failure warnings
+ */
+ gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
+
+ /*
+ * Remove the movable mask so that we don't allocate from
+ * CMA area again.
+ */
+ thp_gfpmask &= ~__GFP_MOVABLE;
+ thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ }
+
+ return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+ unsigned int gup_flags,
+ struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ long i;
+ bool drain_allow = true;
+ bool migrate_allow = true;
+ LIST_HEAD(cma_page_list);
+
+check_again:
+ for (i = 0; i < nr_pages; i++) {
+ /*
+ * If we get a page from the CMA zone, since we are going to
+ * be pinning these entries, we might as well move them out
+ * of the CMA zone if possible.
+ */
+ if (is_migrate_cma_page(pages[i])) {
+
+ struct page *head = compound_head(pages[i]);
+
+ if (PageHuge(head)) {
+ isolate_huge_page(head, &cma_page_list);
+ } else {
+ if (!PageLRU(head) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (!isolate_lru_page(head)) {
+ list_add_tail(&head->lru, &cma_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON +
+ page_is_file_cache(head),
+ hpage_nr_pages(head));
+ }
+ }
+ }
+ }
+
+ if (!list_empty(&cma_page_list)) {
+ /*
+ * drop the above get_user_pages reference.
+ */
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ if (migrate_pages(&cma_page_list, new_non_cma_page,
+ NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+ /*
+ * some of the pages failed migration. Do get_user_pages
+ * without migration.
+ */
+ migrate_allow = false;
+
+ if (!list_empty(&cma_page_list))
+ putback_movable_pages(&cma_page_list);
+ }
+ /*
+ * We did migrate all the pages, Try to get the page references again
+ * migrating any new CMA pages which we failed to isolate earlier.
+ */
+ nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ if ((nr_pages > 0) && migrate_allow) {
+ drain_allow = true;
+ goto check_again;
+ }
+ }
+
+ return nr_pages;
+}
+#else
+static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
+ unsigned int gup_flags,
+ struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return nr_pages;
+}
+#endif
+
/*
* This is the same as get_user_pages() in that it assumes we are
* operating on the current task's mm, but it goes further to validate
@@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages);
* Contrast this to iov_iter_get_pages() usages which are transient.
*/
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas_arg)
{
struct vm_area_struct **vmas = vmas_arg;
- struct vm_area_struct *vma_prev = NULL;
+ unsigned long flags;
long rc, i;
if (!pages)
@@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
return -ENOMEM;
}
+ flags = memalloc_nocma_save();
rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ memalloc_nocma_restore(flags);
+ if (rc < 0)
+ goto out;
- for (i = 0; i < rc; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma == vma_prev)
- continue;
-
- vma_prev = vma;
-
- if (vma_is_fsdax(vma))
- break;
- }
-
- /*
- * Either get_user_pages() failed, or the vma validation
- * succeeded, in either case we don't need to put_page() before
- * returning.
- */
- if (i >= rc)
+ if (check_dax_vmas(vmas, rc)) {
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
goto out;
+ }
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
+ rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
out:
if (vmas != vmas_arg)
kfree(vmas);
@@ -1674,7 +1826,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (!pmd_present(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
@@ -1786,7 +1939,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
* Check if it's allowed to use __get_user_pages_fast() for the range, or
* we need to fall back to the slow version:
*/
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+bool gup_fast_permitted(unsigned long start, int nr_pages)
{
unsigned long len, end;
@@ -1828,7 +1981,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* block IPIs that come from THPs splitting.
*/
- if (gup_fast_permitted(start, nr_pages, write)) {
+ if (gup_fast_permitted(start, nr_pages)) {
local_irq_save(flags);
gup_pgd_range(start, end, write, pages, &nr);
local_irq_restore(flags);
@@ -1870,7 +2023,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- if (gup_fast_permitted(start, nr_pages, write)) {
+ if (gup_fast_permitted(start, nr_pages)) {
local_irq_disable();
gup_pgd_range(addr, end, write, pages, &nr);
local_irq_enable();
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 5b42d3d4b60a..6c0279e70cc4 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -122,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = {
static int gup_benchmark_init(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
- &gup_benchmark_fops);
- if (!ret)
- pr_warn("Failed to create gup_benchmark in debugfs");
+ debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
+ &gup_benchmark_fops);
return 0;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index a04e4b810610..fe1cd87e49ac 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -990,7 +990,7 @@ static void hmm_devmem_ref_kill(struct percpu_ref *ref)
percpu_ref_kill(ref);
}
-static int hmm_devmem_fault(struct vm_area_struct *vma,
+static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
unsigned long addr,
const struct page *page,
unsigned int flags,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index faf357eaf0ce..404acdcd0455 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -616,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
}
return 0;
@@ -1337,6 +1339,7 @@ alloc:
}
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
@@ -1475,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
@@ -1520,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
@@ -1528,7 +1531,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
@@ -1549,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto out_unlock;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto clear_pmdnuma;
}
@@ -1618,7 +1621,7 @@ out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
@@ -1979,7 +1982,6 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
pud_t *pud, unsigned long addr)
{
- pud_t orig_pud;
spinlock_t *ptl;
ptl = __pud_trans_huge_lock(pud, vma);
@@ -1991,8 +1993,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
* pgtable_trans_huge_withdraw after finishing pudp related
* operations.
*/
- orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
- tlb->fullmm);
+ pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_dax(vma)) {
spin_unlock(ptl);
@@ -2437,11 +2438,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end, unsigned long flags)
{
struct page *head = compound_head(page);
- struct zone *zone = page_zone(head);
+ pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
int i;
- lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(head, pgdat);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
@@ -2472,7 +2473,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_unlock(&head->mapping->i_pages);
}
- spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
remap_page(head);
@@ -2683,7 +2684,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
+ spin_lock_irqsave(&pgdata->lru_lock, flags);
if (mapping) {
XA_STATE(xas, &mapping->i_pages, page_index(head));
@@ -2728,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
- spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+ spin_unlock_irqrestore(&pgdata->lru_lock, flags);
remap_page(head);
ret = -EBUSY;
}
@@ -2886,12 +2887,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
static int __init split_huge_pages_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
- &split_huge_pages_fops);
- if (!ret)
- pr_warn("Failed to create split_huge_pages in debugfs");
+ debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
+ &split_huge_pages_fops);
return 0;
}
late_initcall(split_huge_pages_debugfs);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..97b1e0290c66 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- int node = -1;
+ int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
@@ -919,7 +920,7 @@ retry_cpuset:
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_migration_supported(h))
+ if (hugepage_movable_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1586,8 +1587,8 @@ out_unlock:
return page;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
@@ -3238,7 +3239,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct page *ptepage;
unsigned long addr;
int cow;
- struct address_space *mapping = vma->vm_file->f_mapping;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
@@ -3250,23 +3250,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
mmu_notifier_range_init(&range, src, vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
- } else {
- /*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
- */
- i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
-
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
-
dst_pte = huge_pte_alloc(dst, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -3337,8 +3327,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow)
mmu_notifier_invalidate_range_end(&range);
- else
- i_mmap_unlock_read(mapping);
return ret;
}
@@ -3637,7 +3625,6 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
@@ -3658,6 +3645,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -3742,6 +3730,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3755,16 +3744,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * Check once here for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
retry:
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
/*
* Check for page in userfault range
*/
@@ -3784,18 +3773,14 @@ retry:
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
-
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
- i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3807,7 +3792,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3854,6 +3839,9 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto backout;
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3875,6 +3863,15 @@ retry:
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -3940,11 +3937,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3944,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
+ } else {
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep)
+ return VM_FAULT_OOM;
}
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with file truncation.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something changed.
- */
mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep) {
- i_mmap_unlock_read(mapping);
- return VM_FAULT_OOM;
- }
+ idx = vma_hugecache_offset(h, vma, haddr);
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4066,7 +4045,6 @@ out_ptl:
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4128,7 +4106,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4196,6 +4173,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4301,7 +4279,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4420,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
continue;
}
if (!huge_pte_none(pte)) {
- pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte_t old_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
- set_huge_pte_at(mm, address, ptep, pte);
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
@@ -4671,12 +4652,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4693,6 +4672,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4722,6 +4702,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_write(mapping);
return pte;
}
@@ -4732,7 +4713,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/internal.h b/mm/internal.h
index f4a7bb02decf..9eeaf2b95166 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
@@ -183,14 +184,16 @@ extern int user_min_free_kbytes;
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
+ unsigned int nr_freepages; /* Number of isolated free pages */
+ unsigned int nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
- unsigned long nr_freepages; /* Number of isolated free pages */
- unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long total_migrate_scanned;
unsigned long total_free_scanned;
- unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long migrate_pfn; /* isolate_migratepages search base */
- unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ unsigned short fast_search_fail;/* failures to use free list searches */
+ short search_order; /* order to start a fast search at */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
@@ -203,7 +206,16 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
- bool finishing_block; /* Finishing current pageblock */
+ bool rescan; /* Rescanning the same pageblock */
+};
+
+/*
+ * Used in direct compaction when a page should be taken from the freelists
+ * immediately when one is created during the free path.
+ */
+struct capture_control {
+ struct compact_control *cc;
+ struct page *page;
};
unsigned long
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 0a14fcff70ed..5d1065efbd47 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -5,7 +5,10 @@ UBSAN_SANITIZE_generic.o := n
UBSAN_SANITIZE_tags.o := n
KCOV_INSTRUMENT := n
+CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_generic.o = -pg
+CFLAGS_REMOVE_tags.o = -pg
+
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..80bbe62b16cd 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -14,6 +14,8 @@
*
*/
+#define __KASAN_INTERNAL
+
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -298,8 +300,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
return;
}
- cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
-
*flags |= SLAB_KASAN;
}
@@ -349,28 +349,48 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
}
/*
- * Since it's desirable to only call object contructors once during slab
- * allocation, we preassign tags to all such objects. Also preassign tags for
- * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
- * For SLAB allocator we can't preassign tags randomly since the freelist is
- * stored as an array of indexes instead of a linked list. Assign tags based
- * on objects indexes, so that objects that are next to each other get
- * different tags.
- * After a tag is assigned, the object always gets allocated with the same tag.
- * The reason is that we can't change tags for objects with constructors on
- * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
- * code can save the pointer to the object somewhere (e.g. in the object
- * itself). Then if we retag it, the old saved pointer will become invalid.
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ * object somewhere (e.g. in the object itself). We preassign a tag for
+ * each object in caches with constructors during slab creation and reuse
+ * the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ * accessed after being freed. We preassign tags for objects in these
+ * caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ * is stored as an array of indexes instead of a linked list. Assign tags
+ * based on objects indexes, so that objects that are next to each other
+ * get different tags.
*/
-static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+ bool init, bool keep_tag)
{
+ /*
+ * 1. When an object is kmalloc()'ed, two hooks are called:
+ * kasan_slab_alloc() and kasan_kmalloc(). We assign the
+ * tag only in the first one.
+ * 2. We reuse the same tag for krealloc'ed objects.
+ */
+ if (keep_tag)
+ return get_tag(object);
+
+ /*
+ * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+ * set, assign a tag when the object is being allocated (init == false).
+ */
if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return new ? KASAN_TAG_KERNEL : random_tag();
+ return init ? KASAN_TAG_KERNEL : random_tag();
+ /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
+ /* For SLAB assign tags based on the object index in the freelist. */
return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
#else
- return new ? random_tag() : get_tag(object);
+ /*
+ * For SLUB assign a random tag during slab creation, otherwise reuse
+ * the already assigned tag.
+ */
+ return init ? random_tag() : get_tag(object);
#endif
}
@@ -386,17 +406,12 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
__memset(alloc_info, 0, sizeof(*alloc_info));
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- object = set_tag(object, assign_tag(cache, object, true));
+ object = set_tag(object,
+ assign_tag(cache, object, true, false));
return (void *)object;
}
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
- gfp_t flags)
-{
- return kasan_kmalloc(cache, object, cache->object_size, flags);
-}
-
static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
{
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
@@ -452,8 +467,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
return __kasan_slab_free(cache, object, ip, true);
}
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags)
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags, bool keep_tag)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -471,7 +486,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_SHADOW_SCALE_SIZE);
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false);
+ tag = assign_tag(cache, object, false, keep_tag);
/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -483,6 +498,18 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
return set_tag(object, tag);
}
+
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+ gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, size, flags, true);
+}
EXPORT_SYMBOL(kasan_kmalloc);
void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
@@ -522,7 +549,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
if (unlikely(!PageSlab(page)))
return kasan_kmalloc_large(object, size, flags);
else
- return kasan_kmalloc(page->slab_cache, object, size, flags);
+ return __kasan_kmalloc(page->slab_cache, object, size,
+ flags, true);
}
void kasan_poison_kfree(void *ptr, unsigned long ip)
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index ccb6207276e3..504c79363a34 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -275,25 +275,6 @@ EXPORT_SYMBOL(__asan_storeN_noabort);
void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
-/* Emitted by compiler to poison large objects when they go out of scope. */
-void __asan_poison_stack_memory(const void *addr, size_t size)
-{
- /*
- * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
- * by redzones, so we simply round up size to simplify logic.
- */
- kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_USE_AFTER_SCOPE);
-}
-EXPORT_SYMBOL(__asan_poison_stack_memory);
-
-/* Emitted by compiler to unpoison large objects when they go into scope. */
-void __asan_unpoison_stack_memory(const void *addr, size_t size)
-{
- kasan_unpoison_shadow(addr, size);
-}
-EXPORT_SYMBOL(__asan_unpoison_stack_memory);
-
/* Emitted by compiler to poison alloca()ed objects. */
void __asan_alloca_poison(unsigned long addr, size_t size)
{
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index 5e12035888f2..36c645939bc9 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -82,9 +82,6 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
case KASAN_KMALLOC_FREE:
bug_type = "use-after-free";
break;
- case KASAN_USE_AFTER_SCOPE:
- bug_type = "use-after-scope";
- break;
case KASAN_ALLOCA_LEFT:
case KASAN_ALLOCA_RIGHT:
bug_type = "alloca-out-of-bounds";
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 45a1b5e38e1e..ce45c491ebcd 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -42,7 +42,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
#else
static inline bool kasan_p4d_table(pgd_t pgd)
{
- return 0;
+ return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
@@ -54,7 +54,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
#else
static inline bool kasan_pud_table(p4d_t p4d)
{
- return 0;
+ return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
@@ -66,7 +66,7 @@ static inline bool kasan_pmd_table(pud_t pud)
#else
static inline bool kasan_pmd_table(pud_t pud)
{
- return 0;
+ return false;
}
#endif
pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
@@ -83,8 +83,14 @@ static inline bool kasan_early_shadow_page_entry(pte_t pte)
static __init void *early_alloc(size_t size, int node)
{
- return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
- MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
+ MEMBLOCK_ALLOC_ACCESSIBLE, node);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n",
+ __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS));
+
+ return ptr;
}
static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea51b2d898ec..3e0c11f7d7a1 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -34,7 +34,6 @@
#define KASAN_STACK_MID 0xF2
#define KASAN_STACK_RIGHT 0xF3
#define KASAN_STACK_PARTIAL 0xF4
-#define KASAN_USE_AFTER_SCOPE 0xF8
/*
* alloca redzone shadow values
@@ -187,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size);
void __asan_loadN(unsigned long addr, size_t size);
void __asan_storeN(unsigned long addr, size_t size);
void __asan_handle_no_return(void);
-void __asan_poison_stack_memory(const void *addr, size_t size);
-void __asan_unpoison_stack_memory(const void *addr, size_t size);
void __asan_alloca_poison(unsigned long addr, size_t size);
void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 0777649e07c4..63fca3172659 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -46,7 +46,7 @@ void kasan_init_tags(void)
int cpu;
for_each_possible_cpu(cpu)
- per_cpu(prng_state, cpu) = get_random_u32();
+ per_cpu(prng_state, cpu) = (u32)get_cycles();
}
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4f017339ddb2..449044378782 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1074,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1502,6 +1503,7 @@ xa_unlocked:
page_ref_add(new_page, HPAGE_PMD_NR - 1);
set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_anon(new_page);
/*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f9d9dc250428..707fa5579f66 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
unsigned long flags;
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
+ unsigned long untagged_ptr;
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
@@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
write_lock_irqsave(&kmemleak_lock, flags);
- min_addr = min(min_addr, ptr);
- max_addr = max(max_addr, ptr + size);
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
link = &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
@@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end,
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
unsigned long flags;
+ unsigned long untagged_ptr;
read_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
@@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end,
pointer = *ptr;
kasan_enable_current();
- if (pointer < min_addr || pointer >= max_addr)
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+ if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
continue;
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c48ad13b4c9..fc64874dc6f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
chain->chain_prune_time = jiffies;
chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
- chain->nid = -1; /* debug */
+ chain->nid = NUMA_NO_NODE; /* debug */
#endif
ksm_stable_node_chains++;
@@ -667,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
free_stable_node(stable_node);
}
+enum get_ksm_page_flags {
+ GET_KSM_PAGE_NOLOCK,
+ GET_KSM_PAGE_LOCK,
+ GET_KSM_PAGE_TRYLOCK
+};
+
/*
* get_ksm_page: checks if the page indicated by the stable node
* is still its ksm page, despite having held no reference to it.
@@ -686,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+static struct page *get_ksm_page(struct stable_node *stable_node,
+ enum get_ksm_page_flags flags)
{
struct page *page;
void *expected_mapping;
@@ -706,8 +713,9 @@ again:
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
- * but if page is swapcache in migrate_page_move_mapping(), it might
- * still be our page, in which case it's essential to keep the node.
+ * the same is in reuse_ksm_page() case; but if page is swapcache
+ * in migrate_page_move_mapping(), it might still be our page,
+ * in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
/*
@@ -728,8 +736,15 @@ again:
goto stale;
}
- if (lock_it) {
+ if (flags == GET_KSM_PAGE_TRYLOCK) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ERR_PTR(-EBUSY);
+ }
+ } else if (flags == GET_KSM_PAGE_LOCK)
lock_page(page);
+
+ if (flags != GET_KSM_PAGE_NOLOCK) {
if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
@@ -763,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page)
goto out;
@@ -863,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node)
struct page *page;
int err;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page) {
/*
* get_ksm_page did remove_node_from_stable_tree itself.
@@ -1385,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
* stable_node parameter itself will be freed from
* under us if it returns NULL.
*/
- _tree_page = get_ksm_page(dup, false);
+ _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
if (!_tree_page)
continue;
nr += 1;
@@ -1508,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
- return get_ksm_page(stable_node, false);
+ return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
}
/*
* _stable_node_dup set to NULL means the stable_node
@@ -1613,7 +1628,8 @@ again:
* wrprotected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -1673,7 +1689,12 @@ again:
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node_dup, true);
+ tree_page = get_ksm_page(stable_node_dup,
+ GET_KSM_PAGE_TRYLOCK);
+
+ if (PTR_ERR(tree_page) == -EBUSY)
+ return ERR_PTR(-EBUSY);
+
if (unlikely(!tree_page))
/*
* The tree may have been rebalanced,
@@ -1842,7 +1863,8 @@ again:
* wrprotected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -2068,6 +2090,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
remove_rmap_item_from_tree(rmap_item);
if (kpage) {
+ if (PTR_ERR(kpage) == -EBUSY)
+ return;
+
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
@@ -2242,7 +2267,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
list_for_each_entry_safe(stable_node, next,
&migrate_nodes, list) {
- page = get_ksm_page(stable_node, false);
+ page = get_ksm_page(stable_node,
+ GET_KSM_PAGE_NOLOCK);
if (page)
put_page(page);
cond_resched();
@@ -2642,6 +2668,31 @@ again:
goto again;
}
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
+ WARN_ON(!page_mapped(page)) ||
+ WARN_ON(!PageLocked(page))) {
+ dump_page(page, "reuse_ksm_page");
+ return false;
+ }
+#endif
+
+ if (PageSwapCache(page) || !page_stable_node(page))
+ return false;
+ /* Prohibit parallel get_ksm_page() */
+ if (!page_ref_freeze(page, 1))
+ return false;
+
+ page_move_anon_rmap(page, vma);
+ page->index = linear_page_index(vma, address);
+ page_ref_unfreeze(page, 1);
+
+ return true;
+}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5b30625fd365..0730bf8ff39f 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
- size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
@@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
#endif
memcg_get_cache_ids();
- lru->node = kzalloc(size, GFP_KERNEL);
+ lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
if (!lru->node)
goto out;
diff --git a/mm/maccess.c b/mm/maccess.c
index f3416632e5a4..ec00be51a24f 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -30,10 +30,8 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
set_fs(KERNEL_DS);
pagefault_disable();
- current->kernel_uaccess_faults_ok++;
ret = __copy_from_user_inatomic(dst,
(__force const void __user *)src, size);
- current->kernel_uaccess_faults_ok--;
pagefault_enable();
set_fs(old_fs);
@@ -60,9 +58,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
set_fs(KERNEL_DS);
pagefault_disable();
- current->kernel_uaccess_faults_ok++;
ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- current->kernel_uaccess_faults_ok--;
pagefault_enable();
set_fs(old_fs);
@@ -98,13 +94,11 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
set_fs(KERNEL_DS);
pagefault_disable();
- current->kernel_uaccess_faults_ok++;
do {
ret = __get_user(*dst++, (const char __user __force *)src++);
} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
- current->kernel_uaccess_faults_ok--;
dst[-1] = '\0';
pagefault_enable();
set_fs(old_fs);
diff --git a/mm/memblock.c b/mm/memblock.c
index 022d4cbb3618..e7665cf914b1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -26,6 +26,13 @@
#include "internal.h"
+#define INIT_MEMBLOCK_REGIONS 128
+#define INIT_PHYSMEM_REGIONS 4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
/**
* DOC: memblock overview
*
@@ -69,8 +76,19 @@
* :c:func:`memblock_set_node`. The :c:func:`memblock_add_node`
* performs such an assignment directly.
*
- * Once memblock is setup the memory can be allocated using either
- * memblock or bootmem APIs.
+ * Once memblock is setup the memory can be allocated using one of the
+ * API variants:
+ *
+ * * :c:func:`memblock_phys_alloc*` - these functions return the
+ * **physical** address of the allocated memory
+ * * :c:func:`memblock_alloc*` - these functions return the **virtual**
+ * address of the allocated memory.
+ *
+ * Note, that both API variants use implict assumptions about allowed
+ * memory ranges and the fallback methods. Consult the documentation
+ * of :c:func:`memblock_alloc_internal` and
+ * :c:func:`memblock_alloc_range_nid` functions for more elaboarte
+ * description.
*
* As the system boot progresses, the architecture specific
* :c:func:`mem_init` function frees all the memory to the buddy page
@@ -92,7 +110,7 @@ unsigned long max_pfn;
unsigned long long max_possible_pfn;
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif
@@ -105,7 +123,7 @@ struct memblock memblock __initdata_memblock = {
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
- .reserved.max = INIT_MEMBLOCK_REGIONS,
+ .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -125,7 +143,7 @@ static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
-enum memblock_flags __init_memblock choose_memblock_flags(void)
+static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -254,7 +272,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* Return:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
enum memblock_flags flags)
@@ -428,17 +446,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
else
in_slab = &memblock_reserved_in_slab;
- /* Try to find some space for it.
- *
- * WARNING: We assume that either slab_is_available() and we use it or
- * we use MEMBLOCK for allocations. That means that this is unsafe to
- * use when bootmem is currently active (unless bootmem itself is
- * implemented on top of MEMBLOCK which isn't the case yet)
- *
- * This should however not be an issue for now, as we currently only
- * call into MEMBLOCK while it's still active, or much later when slab
- * is active for memory hotplug operations
- */
+ /* Try to find some space for it */
if (use_slab) {
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
@@ -851,11 +859,14 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
if (ret)
return ret;
- for (i = start_rgn; i < end_rgn; i++)
+ for (i = start_rgn; i < end_rgn; i++) {
+ struct memblock_region *r = &type->regions[i];
+
if (set)
- memblock_set_region_flags(&type->regions[i], flag);
+ r->flags |= flag;
else
- memblock_clear_region_flags(&type->regions[i], flag);
+ r->flags &= ~flag;
+ }
memblock_merge_regions(type);
return 0;
@@ -955,8 +966,31 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
*idx = ULLONG_MAX;
}
+static bool should_skip_region(struct memblock_region *m, int nid, int flags)
+{
+ int m_nid = memblock_get_region_node(m);
+
+ /* only memory regions are associated with nodes, check it */
+ if (nid != NUMA_NO_NODE && nid != m_nid)
+ return true;
+
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ return true;
+
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ return true;
+
+ /* skip nomap memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ return true;
+
+ return false;
+}
+
/**
- * __next__mem_range - next function for for_each_free_mem_range() etc.
+ * __next_mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
@@ -1002,20 +1036,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- /* only memory regions are associated with nodes, check it */
- if (nid != NUMA_NO_NODE && nid != m_nid)
- continue;
-
- /* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
- continue;
-
- /* if we want mirror memory skip non-mirror memory regions */
- if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
- continue;
-
- /* skip nomap memory unless we were asked for it explicitly */
- if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ if (should_skip_region(m, nid, flags))
continue;
if (!type_b) {
@@ -1119,20 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- /* only memory regions are associated with nodes, check it */
- if (nid != NUMA_NO_NODE && nid != m_nid)
- continue;
-
- /* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
- continue;
-
- /* if we want mirror memory skip non-mirror memory regions */
- if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
- continue;
-
- /* skip nomap memory unless we were asked for it explicitly */
- if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ if (should_skip_region(m, nid, flags))
continue;
if (!type_b) {
@@ -1248,94 +1256,123 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+/**
+ * memblock_alloc_range_nid - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (phys address)
+ * @end: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
+ *
+ * If the specified node can not hold the requested memory the
+ * allocation falls back to any node in the system
+ *
+ * For systems with memory mirroring, the allocation is attempted first
+ * from the regions with mirroring enabled and then retried from any
+ * memory region.
+ *
+ * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
+ * allocated boot memory block, so that it is never reported as leaks.
+ *
+ * Return:
+ * Physical address of allocated memory block on success, %0 on failure.
+ */
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid,
- enum memblock_flags flags)
+ phys_addr_t end, int nid)
{
+ enum memblock_flags flags = choose_memblock_flags();
phys_addr_t found;
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
if (!align) {
/* Can't use WARNs this early in boot on powerpc */
dump_stack();
align = SMP_CACHE_BYTES;
}
+ if (end > memblock.current_limit)
+ end = memblock.current_limit;
+
+again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size)) {
- /*
- * The min_count is set to 0 so that memblock allocations are
- * never reported as leaks.
- */
- kmemleak_alloc_phys(found, size, 0, 0);
- return found;
- }
- return 0;
-}
-
-phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end,
- enum memblock_flags flags)
-{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
- flags);
-}
-
-phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t max_addr,
- int nid, enum memblock_flags flags)
-{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
-}
-
-phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
-{
- enum memblock_flags flags = choose_memblock_flags();
- phys_addr_t ret;
+ if (found && !memblock_reserve(found, size))
+ goto done;
-again:
- ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
- nid, flags);
+ if (nid != NUMA_NO_NODE) {
+ found = memblock_find_in_range_node(size, align, start,
+ end, NUMA_NO_NODE,
+ flags);
+ if (found && !memblock_reserve(found, size))
+ goto done;
+ }
- if (!ret && (flags & MEMBLOCK_MIRROR)) {
+ if (flags & MEMBLOCK_MIRROR) {
flags &= ~MEMBLOCK_MIRROR;
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
goto again;
}
- return ret;
-}
-phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
- MEMBLOCK_NONE);
-}
-
-phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-{
- phys_addr_t alloc;
-
- alloc = __memblock_alloc_base(size, align, max_addr);
+ return 0;
- if (alloc == 0)
- panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
- &size, &max_addr);
+done:
+ /* Skip kmemleak for kasan_init() due to high volume. */
+ if (end != MEMBLOCK_ALLOC_KASAN)
+ /*
+ * The min_count is set to 0 so that memblock allocated
+ * blocks are never reported as leaks. This is because many
+ * of these blocks are only referred via the physical
+ * address which is not looked up by kmemleak.
+ */
+ kmemleak_alloc_phys(found, size, 0, 0);
- return alloc;
+ return found;
}
-phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align)
+/**
+ * memblock_phys_alloc_range - allocate a memory block inside specified range
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (physical address)
+ * @end: the upper bound of the memory region to allocate (physical address)
+ *
+ * Allocate @size bytes in the between @start and @end.
+ *
+ * Return: physical address of the allocated memory block on success,
+ * %0 on failure.
+ */
+phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
+ phys_addr_t align,
+ phys_addr_t start,
+ phys_addr_t end)
{
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
}
+/**
+ * memblock_phys_alloc_try_nid - allocate a memory block from specified MUMA node
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Allocates memory block from the specified NUMA node. If the node
+ * has no available memory, attempts to allocated from any node in the
+ * system.
+ *
+ * Return: physical address of the allocated memory block on success,
+ * %0 on failure.
+ */
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- phys_addr_t res = memblock_phys_alloc_nid(size, align, nid);
-
- if (res)
- return res;
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ return memblock_alloc_range_nid(size, align, 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
}
/**
@@ -1346,19 +1383,13 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
* @max_addr: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
- * The @min_addr limit is dropped if it can not be satisfied and the allocation
- * will fall back to memory below @min_addr. Also, allocation may fall back
- * to any node in the system if the specified node can not
- * hold the requested memory.
- *
- * The allocation is performed from memory region limited by
- * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE.
- *
- * The phys address of allocated boot memory block is converted to virtual and
- * allocated memory is reset to 0.
+ * Allocates memory block using memblock_alloc_range_nid() and
+ * converts the returned physical address to virtual.
*
- * In addition, function sets the min_count to 0 using kmemleak_alloc for
- * allocated boot memory block, so that it is never reported as leaks.
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Other constraints, such
+ * as node and mirrored memory will be handled again in
+ * memblock_alloc_range_nid().
*
* Return:
* Virtual address of allocated memory block on success, NULL on failure.
@@ -1369,11 +1400,6 @@ static void * __init memblock_alloc_internal(
int nid)
{
phys_addr_t alloc;
- void *ptr;
- enum memblock_flags flags = choose_memblock_flags();
-
- if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
- nid = NUMA_NO_NODE;
/*
* Detect any accidental use of these APIs after slab is ready, as at
@@ -1383,54 +1409,16 @@ static void * __init memblock_alloc_internal(
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, nid);
- if (!align) {
- dump_stack();
- align = SMP_CACHE_BYTES;
- }
+ alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
- if (max_addr > memblock.current_limit)
- max_addr = memblock.current_limit;
-again:
- alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid, flags);
- if (alloc && !memblock_reserve(alloc, size))
- goto done;
+ /* retry allocation without lower limit */
+ if (!alloc && min_addr)
+ alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid);
- if (nid != NUMA_NO_NODE) {
- alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE,
- flags);
- if (alloc && !memblock_reserve(alloc, size))
- goto done;
- }
-
- if (min_addr) {
- min_addr = 0;
- goto again;
- }
+ if (!alloc)
+ return NULL;
- if (flags & MEMBLOCK_MIRROR) {
- flags &= ~MEMBLOCK_MIRROR;
- pr_warn("Could not allocate %pap bytes of mirrored memory\n",
- &size);
- goto again;
- }
-
- return NULL;
-done:
- ptr = phys_to_virt(alloc);
-
- /* Skip kmemleak for kasan_init() due to high volume. */
- if (max_addr != MEMBLOCK_ALLOC_KASAN)
- /*
- * The min_count is set to 0 so that bootmem allocated
- * blocks are never reported as leaks. This is because many
- * of these blocks are only referred via the physical
- * address which is not looked up by kmemleak.
- */
- kmemleak_alloc(ptr, size, 0, 0);
-
- return ptr;
+ return phys_to_virt(alloc);
}
/**
@@ -1472,7 +1460,7 @@ void * __init memblock_alloc_try_nid_raw(
}
/**
- * memblock_alloc_try_nid_nopanic - allocate boot memory block
+ * memblock_alloc_try_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @min_addr: the lower bound of the memory region from where the allocation
@@ -1488,42 +1476,6 @@ void * __init memblock_alloc_try_nid_raw(
* Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
-void * __init memblock_alloc_try_nid_nopanic(
- phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
-{
- void *ptr;
-
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
- __func__, (u64)size, (u64)align, nid, &min_addr,
- &max_addr, (void *)_RET_IP_);
-
- ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
-}
-
-/**
- * memblock_alloc_try_nid - allocate boot memory block with panicking
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region from where the allocation
- * is preferred (phys address)
- * @max_addr: the upper bound of the memory region from where the allocation
- * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
- * allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Public panicking version of memblock_alloc_try_nid_nopanic()
- * which provides debug information (including caller info), if enabled,
- * and panics if the request can not be satisfied.
- *
- * Return:
- * Virtual address of allocated memory block on success, NULL on failure.
- */
void * __init memblock_alloc_try_nid(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
@@ -1536,24 +1488,20 @@ void * __init memblock_alloc_try_nid(
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
min_addr, max_addr, nid);
- if (ptr) {
+ if (ptr)
memset(ptr, 0, size);
- return ptr;
- }
- panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n",
- __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr);
- return NULL;
+ return ptr;
}
/**
- * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * __memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
- * This is only useful when the bootmem allocator has already been torn
+ * This is only useful when the memblock allocator has already been torn
* down, but we are still initializing the system. Pages are released directly
- * to the buddy allocator, no bootmem metadata is updated because it is gone.
+ * to the buddy allocator.
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
{
@@ -1998,8 +1946,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug);
static int __init memblock_init_debugfs(void)
{
struct dentry *root = debugfs_create_dir("memblock", NULL);
- if (!root)
- return -ENXIO;
+
debugfs_create_file("memory", 0444, root,
&memblock.memory, &memblock_debug_fops);
debugfs_create_file("reserved", 0444, root,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index af7f18b32389..532e0e2a4817 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
@@ -248,6 +249,12 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+static inline bool should_force_charge(void)
+{
+ return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING);
+}
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -1389,8 +1396,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
};
bool ret;
- mutex_lock(&oom_lock);
- ret = out_of_memory(&oc);
+ if (mutex_lock_killable(&oom_lock))
+ return true;
+ /*
+ * A few threads which were not waiting at mutex_lock_killable() can
+ * fail to bail out. Therefore, check again after holding oom_lock.
+ */
+ ret = should_force_charge() || out_of_memory(&oc);
mutex_unlock(&oom_lock);
return ret;
}
@@ -2209,9 +2221,7 @@ retry:
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(tsk_is_oom_victim(current) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
+ if (unlikely(should_force_charge()))
goto force;
/*
@@ -2352,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
static void lock_page_lru(struct page *page, int *isolated)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
if (PageLRU(page)) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
*isolated = 1;
@@ -2368,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated)
static void unlock_page_lru(struct page *page, int isolated)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
if (isolated) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
}
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2573,7 +2583,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
}
/**
- * memcg_kmem_charge_memcg: charge a kmem page
+ * __memcg_kmem_charge_memcg: charge a kmem page
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
@@ -2581,7 +2591,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
@@ -2604,24 +2614,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
}
/**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
+ if (memcg_kmem_bypass())
return 0;
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
__SetPageKmemcg(page);
}
@@ -2629,11 +2639,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
return ret;
}
/**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -2664,7 +2674,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone_lru_lock and migration entries setup in all page mappings.
+ * pgdat->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -3337,7 +3347,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -3388,7 +3398,7 @@ static const char *const memcg1_event_names[] = {
static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
@@ -3626,8 +3636,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -3821,7 +3830,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -4420,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size;
+ unsigned int size;
int node;
size = sizeof(struct mem_cgroup);
@@ -5354,6 +5363,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
root_mem_cgroup->use_hierarchy = false;
}
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+ if (value == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+ return 0;
+}
+
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5364,15 +5383,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_min_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long min = READ_ONCE(memcg->memory.min);
-
- if (min == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}
static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5394,15 +5406,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of,
static int memory_low_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->memory.low);
-
- if (low == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}
static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5424,15 +5429,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = READ_ONCE(memcg->high);
-
- if (high == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -5461,15 +5458,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}
static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -5523,7 +5513,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
static int memory_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "low %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
@@ -5541,7 +5531,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
struct accumulated_stats acc;
int i;
@@ -5582,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "file_writeback %llu\n",
(u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_printf(m, "anon_thp %llu\n",
+ (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)acc.lru_pages[i] * PAGE_SIZE);
@@ -5613,12 +5612,18 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+ seq_printf(m, "thp_collapse_alloc %lu\n",
+ acc.events[THP_COLLAPSE_ALLOC]);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
return 0;
}
static int memory_oom_group_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->oom_group);
@@ -5747,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
*
* | memory.current, if memory.current < memory.low
* low_usage = |
- | 0, otherwise.
+ * | 0, otherwise.
*
*
* Such definition of the effective memory.low provides the expected
@@ -6601,15 +6606,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
static int swap_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}
static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6631,7 +6629,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
static int swap_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
diff --git a/mm/memfd.c b/mm/memfd.c
index 97264c79d2cd..650e65a46b9c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
- F_SEAL_WRITE)
+ F_SEAL_WRITE | \
+ F_SEAL_FUTURE_WRITE)
static int memfd_add_seals(struct file *file, unsigned int seals)
{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..fc8b51744579 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
if (fail || tk->addr_valid == 0) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
- force_sig(SIGKILL, tk->tsk);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+ tk->tsk, PIDTYPE_PID);
}
/*
@@ -966,7 +967,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success = true;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1028,19 +1029,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
- } else if (mapping) {
- /*
- * For hugetlb pages, try_to_unmap could potentially call
- * huge_pmd_unshare. Because of this, take semaphore in
- * write mode here and set TTU_RMAP_LOCKED to indicate we
- * have taken the lock at this higer level.
- */
- i_mmap_lock_write(mapping);
- unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
- }
+ unmap_success = try_to_unmap(hpage, ttu);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -1836,19 +1825,17 @@ static int soft_offline_in_use_page(struct page *page, int flags)
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(hpage);
- if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
- unlock_page(hpage);
- if (!PageAnon(hpage))
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unlock_page(page);
+ if (!PageAnon(page))
pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
else
pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(hpage);
+ put_hwpoison_page(page);
return -EBUSY;
}
- unlock_page(hpage);
- get_hwpoison_page(page);
- put_hwpoison_page(hpage);
+ unlock_page(page);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..47fe250307c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
retval = -EINVAL;
- if (PageAnon(page))
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
goto out;
retval = -ENOMEM;
flush_dcache_page(page);
@@ -1503,6 +1504,8 @@ out:
* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
@@ -1830,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* @size: size of map area
* @prot: page protection flags for this mapping
*
- * Note: this is only safe if the mm semaphore is held when called.
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
@@ -1903,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range);
*
* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
* whatever write-combining details or similar.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
@@ -2381,12 +2388,13 @@ oom:
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
- * It handles locking of PTE and modifying it. The function returns
- * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
- * lock.
+ * It handles locking of PTE and modifying it.
*
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
+ *
+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
@@ -2504,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
+ if (PageAnon(vmf->page)) {
int total_map_swapcount;
+ if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
+ page_count(vmf->page) != 1))
+ goto copy;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2520,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
+ if (PageKsm(vmf->page)) {
+ bool reused = reuse_ksm_page(vmf->page, vmf->vma,
+ vmf->address);
+ unlock_page(vmf->page);
+ if (!reused)
+ goto copy;
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+ }
if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
if (total_map_swapcount == 1) {
/*
@@ -2540,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
-
+copy:
/*
* Ok, we need to copy. Oh, well..
*/
@@ -2994,6 +3014,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_pne
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
@@ -3179,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
@@ -3239,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
- * addition. The function returns 0 on success, VM_FAULT_ code in case of
- * error.
+ * addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
@@ -3299,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
static int __init fault_around_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
- &fault_around_bytes_fops);
- if (!ret)
- pr_warn("Failed to create fault_around_bytes in debugfs");
+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
return 0;
}
late_initcall(fault_around_debugfs);
@@ -3495,10 +3536,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
/*
@@ -3539,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
- pte_free(vma->vm_mm, vmf->prealloc_pte);
+ pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
@@ -3564,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
- int page_nid = -1;
+ int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
- pte_t pte;
+ pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -3588,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* Make it present again, Depending on how arch implementes non
* accessible ptes, some can allow access by kernel mode.
*/
- pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
- pte = pte_modify(pte, vma->vm_page_prot);
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
@@ -3631,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
@@ -3645,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATE_FAIL;
out:
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
@@ -4077,8 +4121,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- range->start = address & PAGE_MASK;
- range->end = range->start + PAGE_SIZE;
+ mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
@@ -4128,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd);
*
* Only IO mappings and raw PFN mappings are allowed.
*
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
@@ -4278,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
* @gup_flags: flags modifying lookup behaviour
*
* The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9a667d36c55..f767582af4f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -47,7 +47,7 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page);
+static void generic_online_page(struct page *page, unsigned int order);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -96,27 +96,29 @@ void mem_hotplug_done(void)
cpus_read_unlock();
}
+u64 max_mem_size = U64_MAX;
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
- struct resource *res, *conflict;
- res = kzalloc(sizeof(struct resource), GFP_KERNEL);
- if (!res)
- return ERR_PTR(-ENOMEM);
-
- res->name = "System RAM";
- res->start = start;
- res->end = start + size - 1;
- res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- conflict = request_resource_conflict(&iomem_resource, res);
- if (conflict) {
- if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
- pr_debug("Device unaddressable memory block "
- "memory hotplug at %#010llx !\n",
- (unsigned long long)start);
- }
- pr_debug("System RAM resource %pR cannot be added\n", res);
- kfree(res);
+ struct resource *res;
+ unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ char *resource_name = "System RAM";
+
+ if (start + size > max_mem_size)
+ return ERR_PTR(-E2BIG);
+
+ /*
+ * Request ownership of the new memory range. This might be
+ * a child of an existing resource that was present but
+ * not marked as busy.
+ */
+ res = __request_region(&iomem_resource, start, size,
+ resource_name, flags);
+
+ if (!res) {
+ pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
+ start, start + size);
return ERR_PTR(-EEXIST);
}
return res;
@@ -656,26 +658,40 @@ void __online_page_free(struct page *page)
}
EXPORT_SYMBOL_GPL(__online_page_free);
-static void generic_online_page(struct page *page)
+static void generic_online_page(struct page *page, unsigned int order)
{
- __online_page_set_limits(page);
- __online_page_increment_counters(page);
- __online_page_free(page);
+ kernel_map_pages(page, 1 << order, 1);
+ __free_pages_core(page, order);
+ totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages_add(1UL << order);
+#endif
+}
+
+static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
+{
+ unsigned long end = start + nr_pages;
+ int order, onlined_pages = 0;
+
+ while (start < end) {
+ order = min(MAX_ORDER - 1,
+ get_order(PFN_PHYS(end) - PFN_PHYS(start)));
+ (*online_page_callback)(pfn_to_page(start), order);
+
+ onlined_pages += (1UL << order);
+ start += (1UL << order);
+ }
+ return onlined_pages;
}
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
- struct page *page;
if (PageReserved(pfn_to_page(start_pfn)))
- for (i = 0; i < nr_pages; i++) {
- page = pfn_to_page(start_pfn + i);
- (*online_page_callback)(page);
- onlined_pages++;
- }
+ onlined_pages += online_pages_blocks(start_pfn, nr_pages);
online_mem_sections(start_pfn, start_pfn + nr_pages);
@@ -689,9 +705,9 @@ static void node_states_check_changes_online(unsigned long nr_pages,
{
int nid = zone_to_nid(zone);
- arg->status_change_nid = -1;
- arg->status_change_nid_normal = -1;
- arg->status_change_nid_high = -1;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
@@ -1188,11 +1204,13 @@ static inline int pageblock_free(struct page *page)
return PageBuddy(page) && page_order(page) >= pageblock_order;
}
-/* Return the start of the next active pageblock after a given page */
-static struct page *next_active_pageblock(struct page *page)
+/* Return the pfn of the start of the next active pageblock after a given pfn */
+static unsigned long next_active_pageblock(unsigned long pfn)
{
+ struct page *page = pfn_to_page(pfn);
+
/* Ensure the starting page is pageblock-aligned */
- BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+ BUG_ON(pfn & (pageblock_nr_pages - 1));
/* If the entire pageblock is free, move to the end of free page */
if (pageblock_free(page)) {
@@ -1200,16 +1218,16 @@ static struct page *next_active_pageblock(struct page *page)
/* be careful. we don't have locks, page_order can be changed.*/
order = page_order(page);
if ((order < MAX_ORDER) && (order >= pageblock_order))
- return page + (1 << order);
+ return pfn + (1 << order);
}
- return page + pageblock_nr_pages;
+ return pfn + pageblock_nr_pages;
}
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(unsigned long pfn)
{
+ struct page *page = pfn_to_page(pfn);
struct zone *zone;
- unsigned long pfn;
/*
* We have to be careful here because we are iterating over memory
@@ -1232,12 +1250,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
- struct page *page = pfn_to_page(start_pfn);
- struct page *end_page = page + nr_pages;
+ unsigned long end_pfn, pfn;
+
+ end_pfn = min(start_pfn + nr_pages,
+ zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
/* Check the starting page of each pageblock within the range */
- for (; page < end_page; page = next_active_pageblock(page)) {
- if (!is_pageblock_removable_nolock(page))
+ for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
+ if (!is_pageblock_removable_nolock(pfn))
return false;
cond_resched();
}
@@ -1273,6 +1293,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return 0;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
@@ -1301,23 +1324,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
- struct page *page;
+
for (pfn = start; pfn < end; pfn++) {
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageLRU(page))
- return pfn;
- if (__PageMovable(page))
- return pfn;
- if (PageHuge(page)) {
- if (hugepage_migration_supported(page_hstate(page)) &&
- page_huge_active(page))
- return pfn;
- else
- pfn = round_up(pfn + 1,
- 1 << compound_order(page)) - 1;
- }
- }
+ struct page *page, *head;
+ unsigned long skip;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ return pfn;
+ if (__PageMovable(page))
+ return pfn;
+
+ if (!PageHuge(page))
+ continue;
+ head = compound_head(page);
+ if (hugepage_migration_supported(page_hstate(head)) &&
+ page_huge_active(head))
+ return pfn;
+ skip = (1 << compound_order(head)) - (page - head);
+ pfn += skip - 1;
}
return 0;
}
@@ -1344,7 +1371,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
- int not_managed = 0;
int ret = 0;
LIST_HEAD(source);
@@ -1355,12 +1381,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
if (compound_order(head) > PFN_SECTION_SHIFT) {
ret = -EBUSY;
break;
}
- isolate_huge_page(page, &source);
+ pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(compound_head(page))
@@ -1392,7 +1418,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
else
ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
- put_page(page);
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
@@ -1401,22 +1426,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
- put_page(page);
- /* Because we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page)) {
- not_managed++;
- ret = -EBUSY;
- break;
- }
}
+ put_page(page);
}
if (!list_empty(&source)) {
- if (not_managed) {
- putback_movable_pages(&source);
- goto out;
- }
-
/* Allocate a new page from the nearest neighbor node */
ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
@@ -1429,7 +1442,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
putback_movable_pages(&source);
}
}
-out:
+
return ret;
}
@@ -1499,9 +1512,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
unsigned long present_pages = 0;
enum zone_type zt;
- arg->status_change_nid = -1;
- arg->status_change_nid_normal = -1;
- arg->status_change_nid_high = -1;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
/*
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1576,7 +1589,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
we assume this for now. .*/
if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
&valid_end)) {
- mem_hotplug_done();
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
@@ -1591,7 +1603,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
MIGRATE_MOVABLE,
SKIP_HWPOISON | REPORT_FAILURE);
if (ret) {
- mem_hotplug_done();
reason = "failure to isolate range";
goto failed_removal;
}
@@ -1617,7 +1628,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
cond_resched();
lru_add_drain_all();
- drain_all_pages(zone);
pfn = scan_movable_pages(pfn, end_pfn);
if (pfn) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4496d9d34f5..af171ccb56a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy,
int uninitialized_var(pval);
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
@@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
- int polnid = -1;
+ int polnid = NUMA_NO_NODE;
int ret = -1;
pol = get_vma_policy(vma, addr);
diff --git a/mm/mempool.c b/mm/mempool.c
index 0ef8cc8d1602..85efab3da720 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node);
*
* Like mempool_create(), but initializes the pool in (i.e. embedded in another
* structure).
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
@@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init);
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
* functions might sleep - as long as the mempool_alloc() function is not called
* from IRQ contexts.
+ *
+ * Return: pointer to the created memory pool object or %NULL on error.
*/
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
@@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node);
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int mempool_resize(mempool_t *pool, int new_min_nr)
{
@@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize);
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
* Note: using __GFP_ZERO is not supported.
+ *
+ * Return: pointer to the allocated element or %NULL on error.
*/
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
diff --git a/mm/migrate.c b/mm/migrate.c
index ccf8966caf6f..ac6f4939bb59 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
/*
* Check PageMovable before holding a PG_lock because page's owner
* assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grapping the lock ruins page's owner side.
+ * so unconditionally grabbing the lock ruins page's owner side.
*/
if (unlikely(!__PageMovable(page)))
goto out_putpage;
@@ -374,7 +374,7 @@ unlock:
}
#endif
-static int expected_page_refs(struct page *page)
+static int expected_page_refs(struct address_space *mapping, struct page *page)
{
int expected_count = 1;
@@ -384,7 +384,7 @@ static int expected_page_refs(struct page *page)
*/
expected_count += is_device_private_page(page);
expected_count += is_device_public_page(page);
- if (page_mapping(page))
+ if (mapping)
expected_count += hpage_nr_pages(page) + page_has_private(page);
return expected_count;
@@ -405,7 +405,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = expected_page_refs(page) + extra_count;
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
if (!mapping) {
/* Anonymous page without mapping */
@@ -709,7 +709,6 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
/* Simple case, sync compaction */
if (mode != MIGRATE_ASYNC) {
do {
- get_bh(bh);
lock_buffer(bh);
bh = bh->b_this_page;
@@ -720,18 +719,15 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
/* async case, we cannot block on lock_buffer so use trylock_buffer */
do {
- get_bh(bh);
if (!trylock_buffer(bh)) {
/*
* We failed to lock the buffer and cannot stall in
* async migration. Release the taken locks
*/
struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
bh = head;
while (bh != failed_bh) {
unlock_buffer(bh);
- put_bh(bh);
bh = bh->b_this_page;
}
return false;
@@ -754,7 +750,7 @@ static int __buffer_migrate_page(struct address_space *mapping,
return migrate_page(mapping, newpage, page, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = expected_page_refs(page);
+ expected_count = expected_page_refs(mapping, page);
if (page_count(page) != expected_count)
return -EAGAIN;
@@ -818,7 +814,6 @@ unlock_buffers:
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -916,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping,
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
- return -EAGAIN;
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
@@ -1135,10 +1130,13 @@ out:
* If migration is successful, decrease refcount of the newpage
* which will not free the page because new page owner increased
* refcounter. As well, if it is LRU page, add the page to LRU
- * list in here.
+ * list in here. Use the old state of the isolated source page to
+ * determine if we migrated a LRU page. newpage was already unlocked
+ * and possibly modified by its owner - don't rely on the page
+ * state.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(__PageMovable(newpage)))
+ if (unlikely(!is_lru))
put_page(newpage);
else
putback_lru_page(newpage);
@@ -1289,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
struct anon_vma *anon_vma = NULL;
/*
- * Movability of hugepages depends on architectures and hugepage size.
+ * Migratability of hugepages depends on architectures and their size.
* This check is necessary because some callers of hugepage migration
* like soft offline and memory hotremove don't walk through page
* tables or check whether the hugepage is pmd-based or not before
@@ -1317,6 +1315,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
lock_page(hpage);
}
+ /*
+ * Check for pages which are in the process of being freed. Without
+ * page_mapping() set, hugetlbfs specific move page routine will not
+ * be called and we could leak usage counts for subpools.
+ */
+ if (page_private(hpage) && !page_mapping(hpage)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
@@ -1324,19 +1332,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
- struct address_space *mapping = page_mapping(hpage);
-
- /*
- * try_to_unmap could potentially call huge_pmd_unshare.
- * Because of this, take semaphore in write mode here and
- * set TTU_RMAP_LOCKED to let lower levels know we have
- * taken the lock.
- */
- i_mmap_lock_write(mapping);
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
@@ -1358,6 +1355,7 @@ put_anon:
put_new_page = NULL;
}
+out_unlock:
unlock_page(hpage);
out:
if (rc != -EAGAIN)
diff --git a/mm/mincore.c b/mm/mincore.c
index f0f91461a9f4..218099b5ed31 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -42,14 +42,72 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
return 0;
}
-static int mincore_unmapped_range(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+{
+ unsigned char present = 0;
+ struct page *page;
+
+ /*
+ * When tmpfs swaps out a page from a file, any process mapping that
+ * file will not get a swp_entry_t in its pte, but rather it is like
+ * any other file mapping (ie. marked !present and faulted in with
+ * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+ */
+#ifdef CONFIG_SWAP
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ /*
+ * shmem/tmpfs may return swap: account for swapcache
+ * page too.
+ */
+ if (xa_is_value(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
+#endif
+ if (page) {
+ present = PageUptodate(page);
+ put_page(page);
+ }
+
+ return present;
+}
+
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct vm_area_struct *vma, unsigned char *vec)
{
- unsigned char *vec = walk->private;
unsigned long nr = (end - addr) >> PAGE_SHIFT;
+ int i;
- memset(vec, 0, nr);
- walk->private += nr;
+ if (vma->vm_file) {
+ pgoff_t pgoff;
+
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
+ }
+ return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ walk->private += __mincore_unmapped_range(addr, end,
+ walk->vma, walk->private);
return 0;
}
@@ -69,9 +127,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out;
}
- /* We'll consider a THP page under construction to be there */
if (pmd_trans_unstable(pmd)) {
- memset(vec, 1, nr);
+ __mincore_unmapped_range(addr, end, vma, vec);
goto out;
}
@@ -80,17 +137,28 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t pte = *ptep;
if (pte_none(pte))
- *vec = 0;
+ __mincore_unmapped_range(addr, addr + PAGE_SIZE,
+ vma, vec);
else if (pte_present(pte))
*vec = 1;
else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
- /*
- * migration or hwpoison entries are always
- * uptodate
- */
- *vec = !!non_swap_entry(entry);
+ if (non_swap_entry(entry)) {
+ /*
+ * migration or hwpoison entries are always
+ * uptodate
+ */
+ *vec = 1;
+ } else {
+#ifdef CONFIG_SWAP
+ *vec = mincore_page(swap_address_space(entry),
+ swp_offset(entry));
+#else
+ WARN_ON(1);
+ *vec = 1;
+#endif
+ }
}
vec++;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 41cc47e28ad6..080f3b36415b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -182,7 +182,7 @@ static void __munlock_isolation_failed(struct page *page)
unsigned int munlock_vma_page(struct page *page)
{
int nr_pages;
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
@@ -194,7 +194,7 @@ unsigned int munlock_vma_page(struct page *page)
* might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
if (!TestClearPageMlocked(page)) {
/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -203,17 +203,17 @@ unsigned int munlock_vma_page(struct page *page)
}
nr_pages = hpage_nr_pages(page);
- __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
__munlock_isolated_page(page);
goto out;
}
__munlock_isolation_failed(page);
unlock_out:
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
out:
return nr_pages - 1;
@@ -298,7 +298,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_init(&pvec_putback);
/* Phase 1: page isolation */
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&zone->zone_pgdat->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -325,7 +325,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pvec->pages[i] = NULL;
}
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&zone->zone_pgdat->lru_lock);
/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
diff --git a/mm/mmap.c b/mm/mmap.c
index f901065c4c64..41eb48d9b527 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -438,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma)
{
/*
* As it turns out, RB_DECLARE_CALLBACKS() already created a callback
- * function that does exacltly what we want.
+ * function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
@@ -1012,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
* merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressue on the memory system forcing
+ * comparison, we increase pressure on the memory system forcing
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
@@ -1115,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
* might become case 1 below case 2 below case 3 below
*
- * It is important for case 8 that the the vma NNNN overlapping the
+ * It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
* be extended in region AAAA and NNNN must be removed. This way in
* all cases where vma_merge succeeds, the moment vma_adjust drops the
@@ -1645,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
/*
- * Some shared mappigns will want the pages marked read-only
+ * Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
* to the private version (using protection_map[] without the
* VM_SHARED bit).
@@ -2126,13 +2126,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
const unsigned long mmap_end = arch_get_mmap_end(addr);
@@ -2426,12 +2425,11 @@ int expand_downwards(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
- int error;
+ int error = 0;
address &= PAGE_MASK;
- error = security_mmap_addr(address);
- if (error)
- return error;
+ if (address < mmap_min_addr)
+ return -EPERM;
/* Enforce stack_guard_gap */
prev = vma->vm_prev;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 36cb358db170..028c724dcb1a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue;
}
- ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ oldpte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_modify(oldpte, newprot);
if (preserve_write)
ptent = pte_mk_savedwrite(ptent);
@@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
!(vma->vm_flags & VM_SOFTDIRTY))) {
ptent = pte_mkwrite(ptent);
}
- ptep_modify_prot_commit(mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/mremap.c b/mm/mremap.c
index 3320616ed93f..e3edef6b7a12 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -516,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
+ /*
+ * move_vma() need us to stay 4 maps below the threshold, otherwise
+ * it will bail out at the very beginning.
+ * That is a problem if we have already unmaped the regions here
+ * (new_addr, and old_addr), because userspace will not know the
+ * state of the vma's after it gets -ENOMEM.
+ * So, to avoid such scenario we can pre-compute if the whole
+ * operation has high chances to success map-wise.
+ * Worst-scenario case is when both vma's (new_addr and old_addr) get
+ * split in 3 before unmaping it.
+ * That means 2 more maps (1 for each) to the ones we already hold.
+ * Check whether current map count plus 2 still leads us to 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be more safe.
+ */
+ if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f0e8cd9edb1a..3a2484884cfd 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -647,8 +647,8 @@ static int oom_reaper(void *unused)
static void wake_oom_reaper(struct task_struct *tsk)
{
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
@@ -843,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void __oom_kill_process(struct task_struct *victim)
+static void __oom_kill_process(struct task_struct *victim, const char *message)
{
struct task_struct *p;
struct mm_struct *mm;
@@ -874,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim)
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+ message, task_pid_nr(victim), victim->comm,
+ K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
K(get_mm_counter(victim->mm, MM_FILEPAGES)),
K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
@@ -926,24 +927,20 @@ static void __oom_kill_process(struct task_struct *victim)
* Kill provided task unless it's secured by setting
* oom_score_adj to OOM_SCORE_ADJ_MIN.
*/
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
- if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
+ !is_global_init(task)) {
get_task_struct(task);
- __oom_kill_process(task);
+ __oom_kill_process(task, message);
}
return 0;
}
static void oom_kill_process(struct oom_control *oc, const char *message)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *victim = oc->chosen;
struct mem_cgroup *oom_group;
- unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -952,49 +949,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* its children or threads, just give it access to memory reserves
* so it can die quickly
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ task_lock(victim);
+ if (task_will_free_mem(victim)) {
+ mark_oom_victim(victim);
+ wake_oom_reaper(victim);
+ task_unlock(victim);
+ put_task_struct(victim);
return;
}
- task_unlock(p);
+ task_unlock(victim);
if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
+ dump_header(oc, victim);
/*
* Do we need to kill the entire memory cgroup?
@@ -1003,14 +969,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
*/
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
- __oom_kill_process(victim);
+ __oom_kill_process(victim, message);
/*
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
- mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+ (void*)message);
mem_cgroup_put(oom_group);
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7d1010453fb9..9f61dfec6a1f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
* node_dirtyable_memory - number of dirtyable pages in a node
* @pgdat: the node
*
- * Returns the node's number of pages potentially available for dirty
+ * Return: the node's number of pages potentially available for dirty
* page cache. This is the base value for the per-node dirty limits.
*/
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
@@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
/**
* global_dirtyable_memory - number of globally dirtyable pages
*
- * Returns the global number of pages potentially available for dirty
+ * Return: the global number of pages potentially available for dirty
* page cache. This is the base value for the global dirty limits.
*/
static unsigned long global_dirtyable_memory(void)
@@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
* node_dirty_limit - maximum number of dirty pages allowed in a node
* @pgdat: the node
*
- * Returns the maximum number of dirty pages allowed in a node, based
+ * Return: the maximum number of dirty pages allowed in a node, based
* on the node's dirtyable memory.
*/
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
@@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
* node_dirty_ok - tells whether a node is within its dirty limits
* @pgdat: the node to check
*
- * Returns %true when the dirty pages in @pgdat are within the node's
+ * Return: %true when the dirty pages in @pgdat are within the node's
* dirty limit, %false if the limit is exceeded.
*/
bool node_dirty_ok(struct pglist_data *pgdat)
@@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
* __wb_calc_thresh - @wb's share of dirty throttling threshold
* @dtc: dirty_throttle_context of interest
*
- * Returns @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- *
* Note that balance_dirty_pages() will only seriously take it as a hard limit
* when sleeping max_pause per page is not enough to keep the dirty pages under
* control. For example, when the device is completely stalled due to some error
@@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
*
* The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ *
+ * Return: @wb's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
@@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
* @wb: bdi_writeback of interest
*
* Determines whether background writeback should keep writing @wb or it's
- * clean enough. Returns %true if writeback should continue.
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
*/
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
@@ -2147,6 +2149,8 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
* lock/page writeback access order inversion - we should only ever lock
* multiple pages in ascending page->index order, and looping back to the start
* of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2305,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
*
* This is a library function, which implements the writepages()
* address_space_operation.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
@@ -2351,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
*
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
* function returns.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int write_one_page(struct page *page)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..03fcf73d47da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -289,8 +289,8 @@ EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-int nr_online_nodes __read_mostly = 1;
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
@@ -789,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions*/
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations polluate a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -822,6 +873,7 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -837,6 +889,11 @@ static inline void __free_one_page(struct page *page,
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -1056,7 +1113,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1303,7 +1360,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1382,7 +1439,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1472,14 +1529,14 @@ static void __init deferred_free_range(unsigned long pfn,
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
+ __free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
+ __free_pages_core(page, 0);
}
}
@@ -1945,8 +2002,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -2170,6 +2227,18 @@ static inline void boost_watermark(struct zone *zone)
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return;
+
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
@@ -2214,7 +2283,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
*/
boost_watermark(zone);
if (alloc_flags & ALLOC_KSWAPD)
- wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
@@ -2950,7 +3019,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -3102,6 +3171,12 @@ struct page *rmqueue(struct zone *preferred_zone,
local_irq_restore(flags);
out:
+ /* Separate test+clear to avoid unnecessary atomics */
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ }
+
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
@@ -3155,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void)
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
-
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem))
- goto fail;
- if (!debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order))
- goto fail;
- return 0;
-fail:
- debugfs_remove_recursive(dir);
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
- return -ENOMEM;
+ return 0;
}
late_initcall(fail_page_alloc_debugfs);
@@ -3692,7 +3757,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
@@ -3703,13 +3768,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
+ prio, &page);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE)
+ if (*compact_result <= COMPACT_INACTIVE) {
+ WARN_ON_ONCE(page);
return NULL;
+ }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3717,7 +3784,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -4550,7 +4623,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -4669,11 +4742,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
nc->offset = size;
}
@@ -4689,10 +4762,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
}
@@ -4743,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
@@ -4763,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -4796,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4827,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset)
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
@@ -4839,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
+ *
+ * Return: number of pages beyond high watermark within all zones.
*/
unsigned long nr_free_pagecache_pages(void)
{
@@ -5285,7 +5369,8 @@ static int node_load[MAX_NUMNODES];
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
- * It returns -1 if no node is found.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
@@ -5591,7 +5676,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
@@ -5695,18 +5780,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
cond_resched();
}
}
-#ifdef CONFIG_SPARSEMEM
- /*
- * If the zone does not span the rest of the section then
- * we should at least initialize those pages. Otherwise we
- * could blow up on a poisoned page in some paths which depend
- * on full sections being initialized (e.g. memory hotplug).
- */
- while (end_pfn % PAGES_PER_SECTION) {
- __init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
- end_pfn++;
- }
-#endif
}
#ifdef CONFIG_ZONE_DEVICE
@@ -6010,7 +6083,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -6208,7 +6281,7 @@ unsigned long __init __absent_pages_in_range(int nid,
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range.
+ * Return: the number of pages frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -6370,10 +6443,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize)
+ if (usemapsize) {
zone->pageblock_flags =
- memblock_alloc_node_nopanic(usemapsize,
- pgdat->node_id);
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6602,7 +6679,11 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_alloc_node_nopanic(size, pgdat->node_id);
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6758,14 +6839,14 @@ void __init setup_nr_node_ids(void)
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
- * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6813,7 +6894,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
- * It returns the minimum PFN based on information provided via
+ * Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
@@ -7261,7 +7342,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-EXPORT_SYMBOL(free_reserved_area);
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
@@ -7490,7 +7570,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas control asynch page reclaim, and so should
+ * deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -7880,8 +7960,7 @@ void *__init alloc_large_system_hash(const char *tablename,
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
- table = memblock_alloc_nopanic(size,
- SMP_CACHE_BYTES);
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
else
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
@@ -7967,7 +8046,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
/*
* Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
+ * We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
@@ -8106,7 +8185,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Returns zero on success or negative error code. On success all
+ * Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
@@ -8190,7 +8269,6 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ae44f7adbe07..d8f1aca4ad43 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -161,7 +161,7 @@ static int __init alloc_node_page_ext(int nid)
table_size = get_entry_size() * nr_pages;
- base = memblock_alloc_try_nid_nopanic(
+ base = memblock_alloc_try_nid(
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
if (!base)
@@ -273,6 +273,7 @@ static void free_page_ext(void *addr)
table_size = get_entry_size() * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
@@ -300,7 +301,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == -1) {
+ if (nid == NUMA_NO_NODE) {
/*
* In this case, "nid" already exists and contains valid memory.
* "start_pfn" passed to us is a pfn which is an arg for
@@ -398,10 +399,8 @@ void __init page_ext_init(void)
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
- *
- * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (early_pfn_to_nid(pfn) != nid)
+ if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index b9e4b42b33ab..0b39ec0c945c 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -31,7 +31,7 @@
static struct page *page_idle_get_page(unsigned long pfn)
{
struct page *page;
- struct zone *zone;
+ pg_data_t *pgdat;
if (!pfn_valid(pfn))
return NULL;
@@ -41,13 +41,13 @@ static struct page *page_idle_get_page(unsigned long pfn)
!get_page_unless_zero(page))
return NULL;
- zone = page_zone(page);
- spin_lock_irq(zone_lru_lock(zone));
+ pgdat = page_pgdat(page);
+ spin_lock_irq(&pgdat->lru_lock);
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
return page;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 28b06524939f..925b6f44a444 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -625,16 +625,14 @@ static const struct file_operations proc_page_owner_operations = {
static int __init pageowner_init(void)
{
- struct dentry *dentry;
-
if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
- dentry = debugfs_create_file("page_owner", 0400, NULL,
- NULL, &proc_page_owner_operations);
+ debugfs_create_file("page_owner", 0400, NULL, NULL,
+ &proc_page_owner_operations);
- return PTR_ERR_OR_ZERO(dentry);
+ return 0;
}
late_initcall(pageowner_init)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index f0c15e9017c0..21d4f97cb49b 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+#include <linux/kasan.h>
static bool want_page_poisoning __read_mostly;
@@ -40,7 +41,10 @@ static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
+ /* KASAN still think the page is in-use, so skip it. */
+ kasan_disable_current();
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 0f643dc2dc65..b68d5df14731 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -67,7 +67,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
pcpu_set_page_chunk(nth_page(pages, i), chunk);
chunk->data = pages;
- chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+ chunk->base_addr = page_address(pages);
spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_populated(chunk, 0, nr_pages, false);
diff --git a/mm/percpu.c b/mm/percpu.c
index db86282fd024..2e6fc8d552c9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1086,6 +1086,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
struct pcpu_chunk *chunk;
unsigned long aligned_addr, lcm_align;
int start_offset, offset_bits, region_size, region_bits;
+ size_t alloc_size;
/* region calculations */
aligned_addr = tmp_addr & PAGE_MASK;
@@ -1101,9 +1102,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
region_size = ALIGN(start_offset + map_size, lcm_align);
/* allocate chunk */
- chunk = memblock_alloc(sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(region_size >> PAGE_SHIFT),
- SMP_CACHE_BYTES);
+ alloc_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT);
+ chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
INIT_LIST_HEAD(&chunk->list);
@@ -1114,12 +1118,25 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
chunk->nr_pages = region_size >> PAGE_SHIFT;
region_bits = pcpu_chunk_map_bits(chunk);
- chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]),
- SMP_CACHE_BYTES);
- chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]),
- SMP_CACHE_BYTES);
- chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]),
- SMP_CACHE_BYTES);
+ alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
+ chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->alloc_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size =
+ BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
+ chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->bound_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
+ chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->md_blocks)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
pcpu_init_md_blocks(chunk);
/* manage populated page bitmap */
@@ -1888,7 +1905,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE);
+ ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
if (!ptr)
return NULL;
ai = ptr;
@@ -2044,6 +2061,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int group, unit, i;
int map_size;
unsigned long tmp_addr;
+ size_t alloc_size;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2075,14 +2093,29 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]),
- SMP_CACHE_BYTES);
- group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]),
- SMP_CACHE_BYTES);
- unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]),
- SMP_CACHE_BYTES);
- unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]),
- SMP_CACHE_BYTES);
+ alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
+ group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!group_offsets)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
+ group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!group_sizes)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
+ unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!unit_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
+ unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!unit_off)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -2148,6 +2181,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
SMP_CACHE_BYTES);
+ if (!pcpu_slot)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
@@ -2384,7 +2420,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
- for (group = 0, unit = 0; group_cnt[group]; group++) {
+ for (group = 0, unit = 0; group < nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
/*
@@ -2460,7 +2496,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES);
+ areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -2602,6 +2638,9 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
+ if (!pages)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ pages_size);
/* allocate pages */
j = 0;
@@ -2690,8 +2729,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return memblock_alloc_from_nopanic(
- size, align, __pa(MAX_DMA_ADDRESS));
+ return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
@@ -2739,9 +2777,7 @@ void __init setup_per_cpu_areas(void)
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = memblock_alloc_from_nopanic(unit_size,
- PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
+ fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
diff --git a/mm/readahead.c b/mm/readahead.c
index 1ae16522412a..a4593654a26c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
* @data: private data for the callback routine.
*
* Hides the details of the LRU cache etc from the filesystems.
+ *
+ * Returns: %0 on success, error return by @filler otherwise
*/
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
diff --git a/mm/rmap.c b/mm/rmap.c
index 21a26cf51114..b30c7c71d1d9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,10 +25,9 @@
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
+ * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
@@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
- min(vma->vm_end, vma->vm_start +
+ mmu_notifier_range_init(&range, vma->vm_mm, address,
+ min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
- *
- * If called for a huge page, caller must hold i_mmap_rwsem
- * in write mode as it is possible to call huge_pmd_unshare.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
diff --git a/mm/shmem.c b/mm/shmem.c
index 6ece1e2fe76e..b3db3779a30a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -36,6 +36,7 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
+#include <linux/frontswap.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -123,6 +124,10 @@ static unsigned long shmem_default_max_inodes(void)
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
@@ -1089,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static unsigned long find_swap_entry(struct xarray *xa, void *item)
+extern struct swap_info_struct *swap_info[];
+
+static int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices,
+ bool frontswap)
{
- XA_STATE(xas, xa, 0);
- unsigned int checked = 0;
- void *entry;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ unsigned int ret = 0;
+
+ if (!nr_entries)
+ return 0;
rcu_read_lock();
- xas_for_each(&xas, entry, ULONG_MAX) {
- if (xas_retry(&xas, entry))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
continue;
- if (entry == item)
- break;
- checked++;
- if ((checked % XA_CHECK_SCHED) != 0)
+
+ if (!xa_is_value(page))
continue;
- xas_pause(&xas);
- cond_resched_rcu();
+
+ if (frontswap) {
+ swp_entry_t entry = radix_to_swp_entry(page);
+
+ if (!frontswap_test(swap_info[swp_type(entry)],
+ swp_offset(entry)))
+ continue;
+ }
+
+ indices[ret] = xas.xa_index;
+ entries[ret] = page;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ if (++ret == nr_entries)
+ break;
}
rcu_read_unlock();
- return entry ? xas.xa_index : -1;
+ return ret;
}
/*
- * If swap found in inode, free it and move page from swapcache to filecache.
+ * Move the swapped pages for an inode to page cache. Returns the count
+ * of pages swapped in, or the error in case of failure.
*/
-static int shmem_unuse_inode(struct shmem_inode_info *info,
- swp_entry_t swap, struct page **pagep)
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
+ pgoff_t *indices)
{
- struct address_space *mapping = info->vfs_inode.i_mapping;
- void *radswap;
- pgoff_t index;
- gfp_t gfp;
+ int i = 0;
+ int ret = 0;
int error = 0;
+ struct address_space *mapping = inode->i_mapping;
- radswap = swp_to_radix_entry(swap);
- index = find_swap_entry(&mapping->i_pages, radswap);
- if (index == -1)
- return -EAGAIN; /* tell shmem_unuse we found nothing */
-
- /*
- * Move _head_ to start search for next from here.
- * But be careful: shmem_evict_inode checks list_empty without taking
- * mutex, and there's an instant in list_move_tail when info->swaplist
- * would appear empty, if it were the only one on shmem_swaplist.
- */
- if (shmem_swaplist.next != &info->swaplist)
- list_move_tail(&shmem_swaplist, &info->swaplist);
+ for (i = 0; i < pvec.nr; i++) {
+ struct page *page = pvec.pages[i];
- gfp = mapping_gfp_mask(mapping);
- if (shmem_should_replace_page(*pagep, gfp)) {
- mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_replace_page(pagep, gfp, info, index);
- mutex_lock(&shmem_swaplist_mutex);
- /*
- * We needed to drop mutex to make that restrictive page
- * allocation, but the inode might have been freed while we
- * dropped it: although a racing shmem_evict_inode() cannot
- * complete without emptying the page cache, our page lock
- * on this swapcache page is not enough to prevent that -
- * free_swap_and_cache() of our swap entry will only
- * trylock_page(), removing swap from page cache whatever.
- *
- * We must not proceed to shmem_add_to_page_cache() if the
- * inode has been freed, but of course we cannot rely on
- * inode or mapping or info to check that. However, we can
- * safely check if our swap entry is still in use (and here
- * it can't have got reused for another page): if it's still
- * in use, then the inode cannot have been freed yet, and we
- * can safely proceed (if it's no longer in use, that tells
- * nothing about the inode, but we don't need to unuse swap).
- */
- if (!page_swapcount(*pagep))
- error = -ENOENT;
+ if (!xa_is_value(page))
+ continue;
+ error = shmem_swapin_page(inode, indices[i],
+ &page, SGP_CACHE,
+ mapping_gfp_mask(mapping),
+ NULL, NULL);
+ if (error == 0) {
+ unlock_page(page);
+ put_page(page);
+ ret++;
+ }
+ if (error == -ENOMEM)
+ break;
+ error = 0;
}
+ return error ? error : ret;
+}
- /*
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
- * beneath us (pagelock doesn't help until the page is in pagecache).
- */
- if (!error)
- error = shmem_add_to_page_cache(*pagep, mapping, index,
- radswap, gfp);
- if (error != -ENOMEM) {
- /*
- * Truncation and eviction use free_swap_and_cache(), which
- * only does trylock page: if we raced, best clean up here.
- */
- delete_from_swap_cache(*pagep);
- set_page_dirty(*pagep);
- if (!error) {
- spin_lock_irq(&info->lock);
- info->swapped--;
- spin_unlock_irq(&info->lock);
- swap_free(swap);
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start = 0;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
+ int ret = 0;
+
+ pagevec_init(&pvec);
+ do {
+ unsigned int nr_entries = PAGEVEC_SIZE;
+
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
+ nr_entries = *fs_pages_to_unuse;
+
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
+ pvec.pages, indices,
+ frontswap);
+ if (pvec.nr == 0) {
+ ret = 0;
+ break;
}
- }
- return error;
+
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ if (ret < 0)
+ break;
+
+ if (frontswap_partial) {
+ *fs_pages_to_unuse -= ret;
+ if (*fs_pages_to_unuse == 0) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ break;
+ }
+ }
+
+ start = indices[pvec.nr - 1];
+ } while (true);
+
+ return ret;
}
/*
- * Search through swapped inodes to find and replace swap by page.
+ * Read all the shared memory data that resides in the swap
+ * device 'type' back into memory, so the swap device can be
+ * unused.
*/
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- struct list_head *this, *next;
- struct shmem_inode_info *info;
- struct mem_cgroup *memcg;
+ struct shmem_inode_info *info, *next;
+ struct inode *inode;
+ struct inode *prev_inode = NULL;
int error = 0;
- /*
- * There's a faint possibility that swap page was replaced before
- * caller locked it: caller will come back later with the right page.
- */
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
- goto out;
+ if (list_empty(&shmem_swaplist))
+ return 0;
+
+ mutex_lock(&shmem_swaplist_mutex);
/*
- * Charge page using GFP_KERNEL while we can wait, before taking
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
- * Charged back to the user (not to caller) when swap account is used.
+ * The extra refcount on the inode is necessary to safely dereference
+ * p->next after re-acquiring the lock. New shmem inodes with swap
+ * get added to the end of the list and we will scan them all.
*/
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
- &memcg, false);
- if (error)
- goto out;
- /* No memory allocation: swap entry occupies the slot for the page */
- error = -EAGAIN;
-
- mutex_lock(&shmem_swaplist_mutex);
- list_for_each_safe(this, next, &shmem_swaplist) {
- info = list_entry(this, struct shmem_inode_info, swaplist);
- if (info->swapped)
- error = shmem_unuse_inode(info, swap, &page);
- else
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
+ if (!info->swapped) {
list_del_init(&info->swaplist);
+ continue;
+ }
+
+ inode = igrab(&info->vfs_inode);
+ if (!inode)
+ continue;
+
+ mutex_unlock(&shmem_swaplist_mutex);
+ if (prev_inode)
+ iput(prev_inode);
+ prev_inode = inode;
+
+ error = shmem_unuse_inode(inode, type, frontswap,
+ fs_pages_to_unuse);
cond_resched();
- if (error != -EAGAIN)
+
+ mutex_lock(&shmem_swaplist_mutex);
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
+ if (error)
break;
- /* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
- if (error) {
- if (error != -ENOMEM)
- error = 0;
- mem_cgroup_cancel_charge(page, memcg, false);
- } else
- mem_cgroup_commit_charge(page, memcg, true, false);
-out:
- unlock_page(page);
- put_page(page);
+ if (prev_inode)
+ iput(prev_inode);
+
return error;
}
@@ -1325,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
- list_add_tail(&info->swaplist, &shmem_swaplist);
+ list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
spin_lock_irq(&info->lock);
@@ -1576,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
}
/*
+ * Swap in the page pointed to by *pagep.
+ * Caller has to make sure that *pagep contains a valid swapped page.
+ * Returns 0 and the page in pagep if success. On failure, returns the
+ * the error code and NULL in *pagep.
+ */
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+ struct mem_cgroup *memcg;
+ struct page *page;
+ swp_entry_t swap;
+ int error;
+
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
+ swap = radix_to_swp_entry(*pagep);
+ *pagep = NULL;
+
+ /* Look it up and read it in.. */
+ page = lookup_swap_cache(swap, NULL, 0);
+ if (!page) {
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
+ *fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ /* We have to do this with page locked to prevent races */
+ lock_page(page);
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
+ error = -EEXIST;
+ goto unlock;
+ }
+ if (!PageUptodate(page)) {
+ error = -EIO;
+ goto failed;
+ }
+ wait_on_page_writeback(page);
+
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
+ }
+
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+ false);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp);
+ /*
+ * We already confirmed swap under page lock, and make
+ * no memory allocation here, so usually no possibility
+ * of error; but free_swap_and_cache() only trylocks a
+ * page, so it is just possible that the entry has been
+ * truncated or holepunched since swap was confirmed.
+ * shmem_undo_range() will have done some of the
+ * unaccounting, now delete_from_swap_cache() will do
+ * the rest.
+ */
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg, false);
+ delete_from_swap_cache(page);
+ }
+ }
+ if (error)
+ goto failed;
+
+ mem_cgroup_commit_charge(page, memcg, true, false);
+
+ spin_lock_irq(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ *pagep = page;
+ return 0;
+failed:
+ if (!shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return error;
+}
+
+/*
* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
@@ -1596,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
struct page *page;
- swp_entry_t swap;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
int error;
@@ -1608,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
sgp = SGP_CACHE;
repeat:
- swap.val = 0;
+ if (sgp <= SGP_CACHE &&
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ return -EINVAL;
+ }
+
+ sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = vma ? vma->vm_mm : current->mm;
+
page = find_lock_entry(mapping, index);
if (xa_is_value(page)) {
- swap = radix_to_swp_entry(page);
- page = NULL;
- }
+ error = shmem_swapin_page(inode, index, &page,
+ sgp, gfp, vma, fault_type);
+ if (error == -EEXIST)
+ goto repeat;
- if (sgp <= SGP_CACHE &&
- ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- error = -EINVAL;
- goto unlock;
+ *pagep = page;
+ return error;
}
if (page && sgp == SGP_WRITE)
@@ -1632,7 +1777,7 @@ repeat:
put_page(page);
page = NULL;
}
- if (page || (sgp == SGP_READ && !swap.val)) {
+ if (page || sgp == SGP_READ) {
*pagep = page;
return 0;
}
@@ -1641,215 +1786,138 @@ repeat:
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
- sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
-
- if (swap.val) {
- /* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
- /* Or update major stats only when swapin succeeds?? */
- if (fault_type) {
- *fault_type |= VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
- }
- /* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
- error = -ENOMEM;
- goto failed;
- }
- }
-
- /* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
- error = -EEXIST; /* try again */
- goto unlock;
- }
- if (!PageUptodate(page)) {
- error = -EIO;
- goto failed;
- }
- wait_on_page_writeback(page);
-
- if (shmem_should_replace_page(page, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
- if (error)
- goto failed;
- }
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap), gfp);
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- * Reset swap.val? No, leave it so "failed" goes back to
- * "repeat": reading a hole and writing should succeed.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
- if (error)
- goto failed;
-
- mem_cgroup_commit_charge(page, memcg, true, false);
-
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
- if (sgp == SGP_WRITE)
- mark_page_accessed(page);
-
- delete_from_swap_cache(page);
- set_page_dirty(page);
- swap_free(swap);
-
- } else {
- if (vma && userfaultfd_missing(vma)) {
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
- return 0;
- }
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
- /* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
+ /* shmem_symlink() */
+ if (mapping->a_ops != &shmem_aops)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ goto alloc_huge;
+ switch (sbinfo->huge) {
+ loff_t i_size;
+ pgoff_t off;
+ case SHMEM_HUGE_NEVER:
+ goto alloc_nohuge;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ off = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
- switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
- case SHMEM_HUGE_NEVER:
- goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
- /* fallthrough */
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
- goto alloc_nohuge;
- }
+ /* fallthrough */
+ case SHMEM_HUGE_ADVISE:
+ if (sgp_huge == SGP_HUGE)
+ goto alloc_huge;
+ /* TODO: implement fadvise() hints */
+ goto alloc_nohuge;
+ }
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
- if (IS_ERR(page)) {
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
- }
- if (IS_ERR(page)) {
- int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
- if (error != -ENOSPC)
- goto failed;
- /*
- * Try to reclaim some spece by splitting a huge page
- * beyond i_size on the filesystem.
- */
- while (retry--) {
- int ret;
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- if (ret == SHRINK_STOP)
- break;
- if (ret)
- goto alloc_nohuge;
- }
- goto failed;
- }
-
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ if (IS_ERR(page)) {
+alloc_nohuge:
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+ int retry = 5;
- if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
+ error = PTR_ERR(page);
+ page = NULL;
+ if (error != -ENOSPC)
+ goto unlock;
+ /*
+ * Try to reclaim some space by splitting a huge page
+ * beyond i_size on the filesystem.
+ */
+ while (retry--) {
+ int ret;
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error)
- goto unacct;
- error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK);
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
- goto unacct;
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+ if (ret == SHRINK_STOP)
+ break;
+ if (ret)
+ goto alloc_nohuge;
}
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
+ goto unlock;
+ }
- spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- alloced = true;
+ if (PageTransHuge(page))
+ hindex = round_down(index, HPAGE_PMD_NR);
+ else
+ hindex = index;
- if (PageTransHuge(page) &&
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
- /*
- * Part of the huge page is beyond i_size: subject
- * to shrink under memory pressure.
- */
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
+ if (sgp == SGP_WRITE)
+ __SetPageReferenced(page);
+
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+ PageTransHuge(page));
+ if (error)
+ goto unacct;
+ error = shmem_add_to_page_cache(page, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK);
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg,
+ PageTransHuge(page));
+ goto unacct;
+ }
+ mem_cgroup_commit_charge(page, memcg, false,
+ PageTransHuge(page));
+ lru_cache_add_anon(page);
+
+ spin_lock_irq(&info->lock);
+ info->alloced += 1 << compound_order(page);
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ alloced = true;
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + HPAGE_PMD_NR - 1) {
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
*/
- if (sgp == SGP_FALLOC)
- sgp = SGP_WRITE;
-clear:
+ spin_lock(&sbinfo->shrinklist_lock);
/*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
- * it now, lest undo on failure cancel our earlier guarantee.
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
*/
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
- int i;
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
- for (i = 0; i < (1 << compound_order(head)); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- SetPageUptodate(head);
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
+clear:
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
+ struct page *head = compound_head(page);
+ int i;
+
+ for (i = 0; i < (1 << compound_order(head)); i++) {
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
}
+ SetPageUptodate(head);
}
/* Perhaps the file has been truncated since we checked */
@@ -1879,9 +1947,6 @@ unacct:
put_page(page);
goto alloc_nohuge;
}
-failed:
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
- error = -EEXIST;
unlock:
if (page) {
unlock_page(page);
@@ -2125,6 +2190,24 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+
+ if (info->seals & F_SEAL_FUTURE_WRITE) {
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * "future write" seal active.
+ */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
+ /*
+ * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
+ * read-only mapping, take care to not allow mprotect to revert
+ * protections.
+ */
+ vma->vm_flags &= ~(VM_MAYWRITE);
+ }
+
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
@@ -2375,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;
/* i_mutex is held by caller */
- if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
- if (info->seals & F_SEAL_WRITE)
+ if (unlikely(info->seals & (F_SEAL_GROW |
+ F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
return -EPERM;
@@ -2639,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;
}
@@ -2848,16 +2932,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret;
+ int ret = 0;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
+ * But if an O_TMPFILE file is linked into the tmpfs, the
+ * first link must skip that, to get the accounting right.
*/
- ret = shmem_reserve_inode(inode->i_sb);
- if (ret)
- goto out;
+ if (inode->i_nlink) {
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
+ }
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
@@ -3843,7 +3931,8 @@ int __init shmem_init(void)
return 0;
}
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c9..28652e4218e0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -550,14 +550,6 @@ static void start_cpu_timer(int cpu)
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
- /*
- * The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transferred to another
- * cache the pointers are not cleared and they could be counted as
- * valid references during a kmemleak scan. Therefore, kmemleak must
- * not scan such objects.
- */
- kmemleak_no_scan(ac);
if (ac) {
ac->avail = 0;
ac->limit = limit;
@@ -573,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
struct array_cache *ac = NULL;
ac = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free object.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(ac);
init_arraycache(ac, entries, batchcount);
return ac;
}
@@ -666,20 +666,22 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ kmemleak_no_scan(alc);
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct alien_cache **alc_ptr;
- size_t memsize = sizeof(void *) * nr_node_ids;
int i;
if (limit > 1)
limit = 12;
- alc_ptr = kzalloc_node(memsize, gfp, node);
+ alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
if (!alc_ptr)
return NULL;
@@ -1725,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* This could be made much more intelligent. For now, try to avoid using
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
+ *
+ * Return: number of left-over bytes in a slab
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, slab_flags_t flags)
@@ -1973,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
+ *
+ * Return: a pointer to the created cache or %NULL in case of error
*/
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
@@ -2357,7 +2363,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
void *freelist;
void *addr = page_address(page);
- page->s_mem = kasan_reset_tag(addr) + colour_off;
+ page->s_mem = addr + colour_off;
page->active = 0;
if (OBJFREELIST_SLAB(cachep))
@@ -2366,6 +2372,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
+ freelist = kasan_reset_tag(freelist);
if (!freelist)
return NULL;
} else {
@@ -2679,6 +2686,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
offset *= cachep->colour_off;
+ /*
+ * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
+ * page_address() in the latter returns a non-tagged pointer,
+ * as it should be for slab pages.
+ */
+ kasan_poison_slab(page);
+
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, page_node);
@@ -2687,7 +2701,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
slab_map_pages(cachep, page, freelist);
- kasan_poison_slab(page);
cache_init_objs(cachep, page);
if (gfpflags_allow_blocking(local_flags))
@@ -3533,12 +3546,13 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
*
* Allocate an object from this cache. The flags are only relevant
* if the cache has no available objects.
+ *
+ * Return: pointer to the new object or %NULL in case of error
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
- ret = kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3623,12 +3637,13 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
* node, which can improve the performance for cpu bound structures.
*
* Fallback to other node is possible if __GFP_THISNODE is not set.
+ *
+ * Return: pointer to the new object or %NULL in case of error
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
- ret = kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3692,6 +3707,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
* @caller: function caller for debug tracking of the caller
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
unsigned long caller)
@@ -4157,6 +4174,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
* @buffer: user buffer
* @count: data length
* @ppos: unused
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
@@ -4406,6 +4425,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
unsigned int objnr;
unsigned long offset;
+ ptr = kasan_reset_tag(ptr);
+
/* Find and validate object. */
cachep = page->slab_cache;
objnr = obj_to_index(cachep, page, (void *)ptr);
@@ -4448,6 +4469,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
* The caller must guarantee that objp points to a valid object previously
* allocated with either kmalloc() or kmem_cache_alloc(). The object
* must not be freed during the duration of the call.
+ *
+ * Return: size of the actual memory used by @objp in bytes
*/
size_t ksize(const void *objp)
{
diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9..e5e6658eeacc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return 0;
if (is_root_cache(s))
return 0;
return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
@@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page,
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return;
memcg_kmem_uncharge(page, order);
}
@@ -437,11 +433,10 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
flags &= gfp_allowed_mask;
for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemleak_alloc_recursive(object, s->object_size, 1,
+ p[i] = kasan_slab_alloc(s, p[i], flags);
+ /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
- p[i] = kasan_slab_alloc(s, object, flags);
}
if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 81732d05e74a..03eeb8b7b4b1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -939,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy);
*
* Releases as many slabs as possible for a cache.
* To help debugging, a zero exit status indicates all slabs were released.
+ *
+ * Return: %0 if all slabs were released, non-zero otherwise
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
@@ -1228,8 +1230,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
ret = page ? page_address(page) : NULL;
- kmemleak_alloc(ret, size, 1, flags);
ret = kasan_kmalloc_large(ret, size, flags);
+ /* As ret might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ret, size, 1, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1424,7 +1427,7 @@ void dump_unreclaimable_slab(void)
#if defined(CONFIG_MEMCG)
void *memcg_slab_start(struct seq_file *m, loff_t *pos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
mutex_lock(&slab_mutex);
return seq_list_start(&memcg->kmem_caches, *pos);
@@ -1432,7 +1435,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos)
void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
return seq_list_next(p, &memcg->kmem_caches, pos);
}
@@ -1446,7 +1449,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache,
memcg_params.kmem_caches_node);
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
if (p == memcg->kmem_caches.next)
print_slabinfo_header(m);
@@ -1527,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
* This function is like krealloc() except it never frees the originally
* allocated buffer. Use this if you don't want to free the buffer immediately
* like, for example, with RCU.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
{
@@ -1548,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc);
* lesser of the new and old sizes. If @p is %NULL, krealloc()
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
* %NULL pointer, the object pointed to is freed.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)
{
diff --git a/mm/slub.c b/mm/slub.c
index 36c0befeebd8..1b08fbcb7e61 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
- return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+ /*
+ * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * Normally, this doesn't cause any issues, as both set_freepointer()
+ * and get_freepointer() are called with a pointer with the same tag.
+ * However, there are some issues with CONFIG_SLUB_DEBUG code. For
+ * example, when __free_slub() iterates over objects in a cache, it
+ * passes untagged pointers to check_object(). check_object() in turns
+ * calls get_freepointer() with an untagged pointer, which causes the
+ * freepointer to be restored incorrectly.
+ */
+ return (void *)((unsigned long)ptr ^ s->random ^
+ (unsigned long)kasan_reset_tag((void *)ptr_addr));
#else
return ptr;
#endif
@@ -303,15 +314,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = fixup_red_left(__s, __addr), __idx = 1; \
- __idx <= __objects; \
- __p += (__s)->size, __idx++)
-
/* Determine object index from a given position */
static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
{
- return (p - addr) / s->size;
+ return (kasan_reset_tag(p) - addr) / s->size;
}
static inline unsigned int order_objects(unsigned int order, unsigned int size)
@@ -507,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
base = page_address(page);
+ object = kasan_reset_tag(object);
object = restore_red_left(s, object);
if (object < base || object >= base + page->objects * s->size ||
(object - base) % s->size) {
@@ -1075,9 +1082,18 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
+static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+{
+ if (!(s->flags & SLAB_POISON))
+ return;
+
+ metadata_access_enable();
+ memset(addr, POISON_INUSE, PAGE_SIZE << order);
+ metadata_access_disable();
+}
+
static inline int alloc_consistency_checks(struct kmem_cache *s,
- struct page *page,
- void *object, unsigned long addr)
+ struct page *page, void *object)
{
if (!check_slab(s, page))
return 0;
@@ -1098,7 +1114,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s,
void *object, unsigned long addr)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!alloc_consistency_checks(s, page, object, addr))
+ if (!alloc_consistency_checks(s, page, object))
goto bad;
}
@@ -1330,6 +1346,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
+static inline void setup_page_debug(struct kmem_cache *s,
+ void *addr, int order) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1374,8 +1392,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
*/
static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ptr, size, 1, flags);
- return kasan_kmalloc_large(ptr, size, flags);
+ return ptr;
}
static __always_inline void kfree_hook(void *x)
@@ -1641,27 +1661,25 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
- start = page_address(page);
+ kasan_poison_slab(page);
- if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << order);
+ start = page_address(page);
- kasan_poison_slab(page);
+ setup_page_debug(s, start, order);
shuffle = shuffle_freelist(s, page);
if (!shuffle) {
- for_each_object_idx(p, idx, s, start, page->objects) {
- if (likely(idx < page->objects)) {
- next = p + s->size;
- next = setup_object(s, page, next);
- set_freepointer(s, p, next);
- } else
- set_freepointer(s, p, NULL);
- }
start = fixup_red_left(s, start);
start = setup_object(s, page, start);
page->freelist = start;
+ for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+ next = p + s->size;
+ next = setup_object(s, page, next);
+ set_freepointer(s, p, next);
+ p = next;
+ }
+ set_freepointer(s, p, NULL);
}
page->inuse = page->objects;
@@ -2111,7 +2129,7 @@ redo:
if (!lock) {
lock = 1;
/*
- * Taking the spinlock removes the possiblity
+ * Taking the spinlock removes the possibility
* that acquire_slab() will see a slab page that
* is frozen
*/
@@ -2235,8 +2253,8 @@ static void unfreeze_partials(struct kmem_cache *s,
}
/*
- * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available.
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
+ * partial page slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
@@ -2463,8 +2481,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
- } else
- freelist = NULL;
+ }
return freelist;
}
@@ -3846,6 +3863,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
unsigned int offset;
size_t object_size;
+ ptr = kasan_reset_tag(ptr);
+
/* Find object and usable object size. */
s = page->slab_cache;
@@ -4243,7 +4262,7 @@ void __init kmem_cache_init(void)
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
- pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
diff --git a/mm/sparse.c b/mm/sparse.c
index 7ea5dc6c6b19..69904aa6165b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,11 +65,15 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
- if (slab_is_available())
+ if (slab_is_available()) {
section = kzalloc_node(array_size, GFP_KERNEL, nid);
- else
+ } else {
section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
nid);
+ if (!section)
+ panic("%s: Failed to allocate %lu bytes nid=%d\n",
+ __func__, array_size, nid);
+ }
return section;
}
@@ -197,7 +201,7 @@ static inline int next_present_section_nr(int section_nr)
}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
- ((section_nr >= 0) && \
+ ((section_nr != -1) && \
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
@@ -218,6 +222,9 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
mem_section = memblock_alloc(size, align);
+ if (!mem_section)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, size, align);
}
#endif
@@ -323,9 +330,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = memblock_alloc_try_nid_nopanic(size,
- SMP_CACHE_BYTES, goal, limit,
- nid);
+ p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
if (!p && limit) {
limit = 0;
goto again;
@@ -379,7 +384,7 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return memblock_alloc_node_nopanic(size, pgdat->node_id);
+ return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -404,13 +409,18 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
{
unsigned long size = section_map_size();
struct page *map = sparse_buffer_alloc(size);
+ phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
if (map)
return map;
map = memblock_alloc_try_nid(size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ PAGE_SIZE, addr,
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!map)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
+ __func__, size, PAGE_SIZE, nid, &addr);
+
return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -420,10 +430,11 @@ static void *sparsemap_buf_end __meminitdata;
static void __init sparse_buffer_init(unsigned long size, int nid)
{
+ phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
sparsemap_buf =
memblock_alloc_try_nid_raw(size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS),
+ addr,
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
sparsemap_buf_end = sparsemap_buf + size;
}
diff --git a/mm/swap.c b/mm/swap.c
index 4929bc1be60e..301ed4e04320 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -58,16 +58,16 @@ static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
unsigned long flags;
- spin_lock_irqsave(zone_lru_lock(zone), flags);
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ spin_lock_irqsave(&pgdat->lru_lock, flags);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
}
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
@@ -320,19 +320,14 @@ static inline void activate_page_drain(int cpu)
{
}
-static bool need_activate_page_drain(int cpu)
-{
- return false;
-}
-
void activate_page(struct page *page)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
page = compound_head(page);
- spin_lock_irq(zone_lru_lock(zone));
- __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
+ __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
+ spin_unlock_irq(&pgdat->lru_lock);
}
#endif
@@ -653,13 +648,15 @@ void lru_add_drain(void)
put_cpu();
}
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -702,6 +699,12 @@ void lru_add_drain_all(void)
mutex_unlock(&lock);
}
+#else
+void lru_add_drain_all(void)
+{
+ lru_add_drain();
+}
+#endif
/**
* release_pages - batched put_page()
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fd2f21e1c60a..85245fdec8d9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
- * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
+ * Caller must hold read mmap_sem if vmf->vma is not NULL.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
@@ -543,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!mask)
goto skip;
+ /* Test swap type to make sure the dereference is safe */
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
+ struct inode *inode = si->swap_file->f_mapping->host;
+ if (inode_read_congested(inode))
+ goto skip;
+ }
+
do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
@@ -691,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf,
pte_unmap(orig_pte);
}
+/**
+ * swap_vma_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read in a few pages whoes
+ * virtual addresses are around the fault address in the same vma.
+ *
+ * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ *
+ */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbac1d49469d..2b8d9c3fbb47 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+static struct swap_info_struct *swap_type_to_swap_info(int type)
+{
+ if (type >= READ_ONCE(nr_swapfiles))
+ return NULL;
+
+ smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
+ return READ_ONCE(swap_info[type]);
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
@@ -1044,12 +1053,14 @@ noswap:
/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
pgoff_t offset;
- si = swap_info[type];
+ if (!si)
+ goto fail;
+
spin_lock(&si->lock);
- if (si && (si->flags & SWP_WRITEOK)) {
+ if (si->flags & SWP_WRITEOK) {
atomic_long_dec(&nr_swap_pages);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
@@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type)
atomic_long_inc(&nr_swap_pages);
}
spin_unlock(&si->lock);
+fail:
return (swp_entry_t) {0};
}
@@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
if (!entry.val)
goto out;
type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swap_type_to_swap_info(type);
+ if (!p)
goto bad_nofile;
- p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
sector_t swapdev_block(int type, pgoff_t offset)
{
struct block_device *bdev;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
- if ((unsigned int)type >= nr_swapfiles)
- return 0;
- if (!(swap_info[type]->flags & SWP_WRITEOK))
+ if (!si || !(si->flags & SWP_WRITEOK))
return 0;
return map_swap_entry(swp_entry(type, offset), &bdev);
}
@@ -1799,44 +1810,77 @@ out_nolock:
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- pte_t swp_pte = swp_entry_to_pte(entry);
+ struct page *page;
+ swp_entry_t entry;
pte_t *pte;
+ struct swap_info_struct *si;
+ unsigned long offset;
int ret = 0;
+ volatile unsigned char *swap_map;
- /*
- * We don't actually need pte lock while scanning for swp_pte: since
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
- * page table while we're scanning; though it could get zapped, and on
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
- * of unmatched parts which look like swp_pte, so unuse_pte must
- * recheck under pte lock. Scanning without pte lock lets it be
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
- */
+ si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
- /*
- * swapoff spends a _lot_ of time in this loop!
- * Test inline before going to call unuse_pte.
- */
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
- pte_unmap(pte);
- ret = unuse_pte(vma, pmd, addr, entry, page);
- if (ret)
- goto out;
- pte = pte_offset_map(pmd, addr);
+ struct vm_fault vmf;
+
+ if (!is_swap_pte(*pte))
+ continue;
+
+ entry = pte_to_swp_entry(*pte);
+ if (swp_type(entry) != type)
+ continue;
+
+ offset = swp_offset(entry);
+ if (frontswap && !frontswap_test(si, offset))
+ continue;
+
+ pte_unmap(pte);
+ swap_map = &si->swap_map[offset];
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ if (!page) {
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ goto try_next;
+ return -ENOMEM;
+ }
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ goto out;
}
+
+ try_to_free_swap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ goto out;
+ }
+try_next:
+ pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1);
+
+ ret = 0;
out:
return ret;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
pmd_t *pmd;
unsigned long next;
@@ -1848,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@@ -1857,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
pud_t *pud;
unsigned long next;
@@ -1868,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@@ -1877,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
p4d_t *p4d;
unsigned long next;
@@ -1888,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end);
return 0;
}
-static int unuse_vma(struct vm_area_struct *vma,
- swp_entry_t entry, struct page *page)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
pgd_t *pgd;
unsigned long addr, end, next;
int ret;
- if (page_anon_vma(page)) {
- addr = page_address_in_vma(page, vma);
- if (addr == -EFAULT)
- return 0;
- else
- end = addr + PAGE_SIZE;
- } else {
- addr = vma->vm_start;
- end = vma->vm_end;
- }
+ addr = vma->vm_start;
+ end = vma->vm_end;
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
-static int unuse_mm(struct mm_struct *mm,
- swp_entry_t entry, struct page *page)
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
struct vm_area_struct *vma;
int ret = 0;
- if (!down_read_trylock(&mm->mmap_sem)) {
- /*
- * Activate page so shrink_inactive_list is unlikely to unmap
- * its ptes while lock is dropped, so swapoff can make progress.
- */
- activate_page(page);
- unlock_page(page);
- down_read(&mm->mmap_sem);
- lock_page(page);
- }
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
- break;
+ if (vma->anon_vma) {
+ ret = unuse_vma(vma, type, frontswap,
+ fs_pages_to_unuse);
+ if (ret)
+ break;
+ }
cond_resched();
}
up_read(&mm->mmap_sem);
- return (ret < 0)? ret: 0;
+ return ret;
}
/*
* Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
+ * from current position to next entry still in use. Return 0
+ * if there are no inuse entries after prev till end of the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev, bool frontswap)
{
- unsigned int max = si->max;
- unsigned int i = prev;
+ unsigned int i;
unsigned char count;
/*
@@ -1968,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* hits are okay, and sys_swapoff() has already prevented new
* allocations from this area (while holding swap_lock).
*/
- for (;;) {
- if (++i >= max) {
- if (!prev) {
- i = 0;
- break;
- }
- /*
- * No entries in use at top of swap_map,
- * loop back to start and recheck there.
- */
- max = prev + 1;
- prev = 0;
- i = 1;
- }
+ for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
if (!frontswap || frontswap_test(si, i))
@@ -1989,240 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
}
+
+ if (i == si->max)
+ i = 0;
+
return i;
}
/*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it. All the necessary
- * page table adjustments can then be made atomically.
- *
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
* pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
+#define SWAP_UNUSE_MAX_TRIES 3
int try_to_unuse(unsigned int type, bool frontswap,
unsigned long pages_to_unuse)
{
+ struct mm_struct *prev_mm;
+ struct mm_struct *mm;
+ struct list_head *p;
+ int retval = 0;
struct swap_info_struct *si = swap_info[type];
- struct mm_struct *start_mm;
- volatile unsigned char *swap_map; /* swap_map is accessed without
- * locking. Mark it as volatile
- * to prevent compiler doing
- * something odd.
- */
- unsigned char swcount;
struct page *page;
swp_entry_t entry;
- unsigned int i = 0;
- int retval = 0;
+ unsigned int i;
+ int retries = 0;
- /*
- * When searching mms for an entry, a good strategy is to
- * start at the first mm we freed the previous entry from
- * (though actually we don't notice whether we or coincidence
- * freed the entry). Initialize this start_mm with a hold.
- *
- * A simpler strategy would be to start at the last mm we
- * freed the previous entry from; but that would take less
- * advantage of mmlist ordering, which clusters forked mms
- * together, child after parent. If we race with dup_mmap(), we
- * prefer to resolve parent before child, lest we miss entries
- * duplicated after we scanned child: using last mm would invert
- * that.
- */
- start_mm = &init_mm;
- mmget(&init_mm);
+ if (!si->inuse_pages)
+ return 0;
- /*
- * Keep on scanning until all entries have gone. Usually,
- * one pass through swap_map is enough, but not necessarily:
- * there are races when an instance of an entry might be missed.
- */
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+ if (!frontswap)
+ pages_to_unuse = 0;
+
+retry:
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+ if (retval)
+ goto out;
+
+ prev_mm = &init_mm;
+ mmget(prev_mm);
+
+ spin_lock(&mmlist_lock);
+ p = &init_mm.mmlist;
+ while ((p = p->next) != &init_mm.mmlist) {
if (signal_pending(current)) {
retval = -EINTR;
break;
}
- /*
- * Get a page for the entry, using the existing swap
- * cache page if there is one. Otherwise, get a clean
- * page and read the swap into it.
- */
- swap_map = &si->swap_map[i];
- entry = swp_entry(type, i);
- page = read_swap_cache_async(entry,
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
- if (!page) {
- /*
- * Either swap_duplicate() failed because entry
- * has been freed independently, and will not be
- * reused since sys_swapoff() already disabled
- * allocation from here, or alloc_page() failed.
- */
- swcount = *swap_map;
- /*
- * We don't hold lock here, so the swap entry could be
- * SWAP_MAP_BAD (when the cluster is discarding).
- * Instead of fail out, We can just skip the swap
- * entry because swapoff will wait for discarding
- * finish anyway.
- */
- if (!swcount || swcount == SWAP_MAP_BAD)
- continue;
- retval = -ENOMEM;
- break;
- }
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!mmget_not_zero(mm))
+ continue;
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
- /*
- * Don't hold on to start_mm if it looks like exiting.
- */
- if (atomic_read(&start_mm->mm_users) == 1) {
- mmput(start_mm);
- start_mm = &init_mm;
- mmget(&init_mm);
+ if (retval) {
+ mmput(prev_mm);
+ goto out;
}
/*
- * Wait for and lock page. When do_swap_page races with
- * try_to_unuse, do_swap_page can handle the fault much
- * faster than try_to_unuse can locate the entry. This
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
- * defer to do_swap_page in such a case - in some tests,
- * do_swap_page and try_to_unuse repeatedly compete.
- */
- wait_on_page_locked(page);
- wait_on_page_writeback(page);
- lock_page(page);
- wait_on_page_writeback(page);
-
- /*
- * Remove all references to entry.
+ * Make sure that we aren't completely killing
+ * interactive performance.
*/
- swcount = *swap_map;
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
- retval = shmem_unuse(entry, page);
- /* page has already been unlocked and released */
- if (retval < 0)
- break;
- continue;
- }
- if (swap_count(swcount) && start_mm != &init_mm)
- retval = unuse_mm(start_mm, entry, page);
-
- if (swap_count(*swap_map)) {
- int set_start_mm = (*swap_map >= swcount);
- struct list_head *p = &start_mm->mmlist;
- struct mm_struct *new_start_mm = start_mm;
- struct mm_struct *prev_mm = start_mm;
- struct mm_struct *mm;
-
- mmget(new_start_mm);
- mmget(prev_mm);
- spin_lock(&mmlist_lock);
- while (swap_count(*swap_map) && !retval &&
- (p = p->next) != &start_mm->mmlist) {
- mm = list_entry(p, struct mm_struct, mmlist);
- if (!mmget_not_zero(mm))
- continue;
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- prev_mm = mm;
+ cond_resched();
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
- cond_resched();
+ mmput(prev_mm);
- swcount = *swap_map;
- if (!swap_count(swcount)) /* any usage ? */
- ;
- else if (mm == &init_mm)
- set_start_mm = 1;
- else
- retval = unuse_mm(mm, entry, page);
-
- if (set_start_mm && *swap_map < swcount) {
- mmput(new_start_mm);
- mmget(mm);
- new_start_mm = mm;
- set_start_mm = 0;
- }
- spin_lock(&mmlist_lock);
- }
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- mmput(start_mm);
- start_mm = new_start_mm;
- }
- if (retval) {
- unlock_page(page);
- put_page(page);
- break;
- }
+ i = 0;
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
- /*
- * If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_unmap could
- * then re-duplicate the entry once we drop page lock,
- * so we might loop indefinitely; also, that page could
- * not be swapped out to other storage meanwhile. So:
- * delete from cache even if there's another reference,
- * after ensuring that the data has been saved to disk -
- * since if the reference remains (rarer), it will be
- * read from disk into another page. Splitting into two
- * pages would be incorrect if swap supported "shared
- * private" pages, but they are handled by tmpfs files.
- *
- * Given how unuse_vma() targets one particular offset
- * in an anon_vma, once the anon_vma has been determined,
- * this splitting happens to be just what is needed to
- * handle where KSM pages have been swapped out: re-reading
- * is unnecessarily slow, but we can fix that later on.
- */
- if (swap_count(*swap_map) &&
- PageDirty(page) && PageSwapCache(page)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
-
- swap_writepage(compound_head(page), &wbc);
- lock_page(page);
- wait_on_page_writeback(page);
- }
+ entry = swp_entry(type, i);
+ page = find_get_page(swap_address_space(entry), i);
+ if (!page)
+ continue;
/*
* It is conceivable that a racing task removed this page from
- * swap cache just before we acquired the page lock at the top,
- * or while we dropped it in unuse_mm(). The page might even
- * be back in swap cache on another swap area: that we must not
- * delete, since it may not have been written out to swap yet.
- */
- if (PageSwapCache(page) &&
- likely(page_private(page) == entry.val) &&
- (!PageTransCompound(page) ||
- !swap_page_trans_huge_swapped(si, entry)))
- delete_from_swap_cache(compound_head(page));
-
- /*
- * So we could skip searching mms once swap count went
- * to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_page_list will preserve it.
+ * swap cache just before we acquired the page lock. The page
+ * might even be back in swap cache on another swap area. But
+ * that is okay, try_to_free_swap() only removes stale pages.
*/
- SetPageDirty(page);
+ lock_page(page);
+ wait_on_page_writeback(page);
+ try_to_free_swap(page);
unlock_page(page);
put_page(page);
/*
- * Make sure that we aren't completely killing
- * interactive performance.
+ * For frontswap, we just need to unuse pages_to_unuse, if
+ * it was specified. Need not check frontswap again here as
+ * we already zeroed out pages_to_unuse if not frontswap.
*/
- cond_resched();
- if (frontswap && pages_to_unuse > 0) {
- if (!--pages_to_unuse)
- break;
- }
+ if (pages_to_unuse && --pages_to_unuse == 0)
+ goto out;
}
- mmput(start_mm);
- return retval;
+ /*
+ * Lets check again to see if there are still swap entries in the map.
+ * If yes, we would need to do retry the unuse logic again.
+ * Under global memory pressure, swap entries can be reinserted back
+ * into process space after the mmlist loop above passes over them.
+ * Its not worth continuosuly retrying to unuse the swap in this case.
+ * So we try SWAP_UNUSE_MAX_TRIES times.
+ */
+ if (++retries >= SWAP_UNUSE_MAX_TRIES)
+ retval = -EBUSY;
+ else if (si->inuse_pages)
+ goto retry;
+
+out:
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
}
/*
@@ -2258,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
struct swap_extent *se;
pgoff_t offset;
- sis = swap_info[swp_type(entry)];
+ sis = swp_swap_info(entry);
*bdev = sis->bdev;
offset = swp_offset(entry);
@@ -2700,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2722,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;
- for (; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
@@ -2813,9 +2713,8 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
unsigned int type;
int i;
- int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
- p = kvzalloc(size, GFP_KERNEL);
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -2831,14 +2730,14 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= nr_swapfiles) {
p->type = type;
- swap_info[type] = p;
+ WRITE_ONCE(swap_info[type], p);
/*
* Write swap_info[type] before nr_swapfiles, in case a
* racing procfs swap_start() or swap_next() is reading them.
* (We never shrink nr_swapfiles, we never free this entry.)
*/
smp_wmb();
- nr_swapfiles++;
+ WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
} else {
kvfree(p);
p = swap_info[type];
@@ -3358,7 +3257,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
@@ -3366,10 +3265,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
if (non_swap_entry(entry))
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swp_swap_info(entry);
+ if (!p)
goto bad_file;
- p = swap_info[type];
+
offset = swp_offset(entry);
if (unlikely(offset >= p->max))
goto out;
@@ -3466,7 +3365,7 @@ int swapcache_prepare(swp_entry_t entry)
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
- return swap_info[swp_type(entry)];
+ return swap_type_to_swap_info(swp_type(entry));
}
struct swap_info_struct *page_swap_info(struct page *page)
diff --git a/mm/truncate.c b/mm/truncate.c
index 798e7ccfb030..b7d3c99f00c9 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -539,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final);
* invalidate_mapping_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
+ *
+ * Return: the number of the pages that were invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -664,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EBUSY if any pages could not be invalidated.
+ * Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -761,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EBUSY if any pages could not be invalidated.
+ * Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
/*
* Validates that the given object is:
* - not bogus address
- * - known-safe heap or stack object
+ * - fully contained by stack (or stack frame, when available)
+ * - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
/* Check for invalid addresses. */
check_bogus_address((const unsigned long)ptr, n, to_user);
- /* Check for bad heap object. */
- check_heap_object(ptr, n, to_user);
-
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
usercopy_abort("process stack", NULL, to_user, 0, n);
}
+ /* Check for bad heap object. */
+ check_heap_object(ptr, n, to_user);
+
/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 065c1ce191c4..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry:
VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
- * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
- * i_mmap_rwsem ensures the dst_pte remains valid even
- * in the case of shared pmds. fault mutex prevents
- * races with other faulting threads.
+ * Serialize via hugetlb_fault_mutex
*/
- mapping = dst_vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -283,7 +279,6 @@ retry:
dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -291,7 +286,6 @@ retry:
dst_pteval = huge_ptep_get(dst_pte);
if (!huge_pte_none(dst_pteval)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -299,7 +293,6 @@ retry:
dst_addr, src_addr, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
diff --git a/mm/util.c b/mm/util.c
index 4df23d64aac7..d559bde497a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -36,6 +36,8 @@ EXPORT_SYMBOL(kfree_const);
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrdup(const char *s, gfp_t gfp)
{
@@ -58,9 +60,10 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Function returns source string if it is in .rodata section otherwise it
- * fallbacks to kstrdup.
- * Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ *
+ * Return: source string if it is in .rodata section otherwise
+ * fallback to kstrdup.
*/
const char *kstrdup_const(const char *s, gfp_t gfp)
{
@@ -78,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const);
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
@@ -103,6 +108,8 @@ EXPORT_SYMBOL(kstrndup);
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -120,6 +127,9 @@ EXPORT_SYMBOL(kmemdup);
* @s: The data to stringify
* @len: The size of the data
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s with NUL-termination or %NULL in
+ * case of error
*/
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
@@ -143,14 +153,14 @@ EXPORT_SYMBOL(kmemdup_nul);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure. Result is physically
+ * Return: an ERR_PTR() on failure. Result is physically
* contiguous, to be freed by kfree().
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
- p = kmalloc_track_caller(len, GFP_USER);
+ p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -169,7 +179,7 @@ EXPORT_SYMBOL(memdup_user);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure. Result may be not
+ * Return: an ERR_PTR() on failure. Result may be not
* physically contiguous. Use kvfree() to free.
*/
void *vmemdup_user(const void __user *src, size_t len)
@@ -193,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user);
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *strndup_user(const char __user *s, long n)
{
@@ -224,7 +236,7 @@ EXPORT_SYMBOL(strndup_user);
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure.
+ * Return: an ERR_PTR() on failure.
*/
void *memdup_user_nul(const void __user *src, size_t len)
{
@@ -310,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- *
* get_user_pages_fast provides equivalent functionality to get_user_pages,
* operating on current and current->mm, with force=0 and vma=NULL. However
* unlike get_user_pages, it must be called without mmap_sem held.
@@ -325,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* pages have to be faulted in, it may turn out to be slightly slower so
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
+ *
+ * Return: number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
*/
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
@@ -386,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap);
*
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
* fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory of %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -478,7 +492,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
+ for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
@@ -729,7 +743,8 @@ error:
* @buffer: the buffer to copy to.
* @buflen: the length of the buffer. Larger cmdline values are truncated
* to this length.
- * Returns the size of the cmdline field copied. Note that the copy does
+ *
+ * Return: the size of the cmdline field copied. Note that the copy does
* not guarantee an ending NULL byte.
*/
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 871e41c55e23..e86ba6e74b50 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -498,7 +498,11 @@ nocache:
}
found:
- if (addr + size > vend)
+ /*
+ * Check also calculated address against the vstart,
+ * because it can be 0 because of big align request.
+ */
+ if (addr + size > vend || addr < vstart)
goto overflow;
va->va_start = addr;
@@ -840,7 +844,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
* @order: how many 2^order pages should be occupied in newly allocated block
* @gfp_mask: flags for the page level allocator
*
- * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
*/
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
@@ -1187,6 +1191,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1421,13 +1426,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
}
/**
- * get_vm_area - reserve a contiguous kernel virtual area
- * @size: size of the area
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
- * Search an area of @size in the kernel virtual mapping area,
- * and reserved it for out purposes. Returns the area descriptor
- * on success or %NULL on failure.
+ * Search an area of @size in the kernel virtual mapping area,
+ * and reserved it for out purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
@@ -1444,12 +1451,14 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
}
/**
- * find_vm_area - find a continuous kernel virtual area
- * @addr: base address
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
*
- * Search for the kernel VM area starting at @addr, and return it.
- * It is up to the caller to do all required locking to keep the returned
- * pointer valid.
+ * Return: pointer to the found area or %NULL on faulure
*/
struct vm_struct *find_vm_area(const void *addr)
{
@@ -1463,12 +1472,14 @@ struct vm_struct *find_vm_area(const void *addr)
}
/**
- * remove_vm_area - find and remove a continuous kernel virtual area
- * @addr: base address
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr: base address
*
- * Search for the kernel VM area starting at @addr, and remove it.
- * This function returns the found VM area, but using it is NOT safe
- * on SMP machines, except for its size or flags.
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ *
+ * Return: pointer to the found area or %NULL on faulure
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -1505,7 +1516,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = find_vmap_area((unsigned long)addr)->vm;
+ area = find_vm_area(addr);
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
@@ -1548,11 +1559,11 @@ static inline void __vfree_deferred(const void *addr)
}
/**
- * vfree_atomic - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
*
- * This one is just like vfree() but can be called in any atomic context
- * except NMIs.
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
*/
void vfree_atomic(const void *addr)
{
@@ -1565,21 +1576,29 @@ void vfree_atomic(const void *addr)
__vfree_deferred(addr);
}
+static void __vfree(const void *addr)
+{
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
+ __vunmap(addr, 1);
+}
+
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - release memory allocated by vmalloc()
+ * @addr: memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as
+ * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
+ * NULL, no operation is performed.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * Must not be called in NMI context (strictly speaking, only if we don't
+ * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-depenedent would be a really bad idea)
*
- * May sleep if called *not* from interrupt context.
+ * May sleep if called *not* from interrupt context.
*
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{
@@ -1591,21 +1610,19 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
- __vunmap(addr, 1);
+
+ __vfree(addr);
}
EXPORT_SYMBOL(vfree);
/**
- * vunmap - release virtual mapping obtained by vmap()
- * @addr: memory base address
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr: memory base address
*
- * Free the virtually contiguous memory area starting at @addr,
- * which was created from the page array passed to vmap().
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
*
- * Must not be called in interrupt context.
+ * Must not be called in interrupt context.
*/
void vunmap(const void *addr)
{
@@ -1617,17 +1634,19 @@ void vunmap(const void *addr)
EXPORT_SYMBOL(vunmap);
/**
- * vmap - map an array of pages into virtually contiguous space
- * @pages: array of page pointers
- * @count: number of pages to map
- * @flags: vm_area->flags
- * @prot: page protection for the mapping
- *
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
+ *
+ * Maps @count pages from @pages into contiguous kernel virtual
+ * space.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *vmap(struct page **pages, unsigned int count,
- unsigned long flags, pgprot_t prot)
+ unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
unsigned long size; /* In bytes */
@@ -1709,25 +1728,27 @@ fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
- vfree(area->addr);
+ __vfree(area->addr);
return NULL;
}
/**
- * __vmalloc_node_range - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @start: vm area range start
- * @end: vm area range end
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
- *
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
@@ -1768,25 +1789,35 @@ fail:
return NULL;
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node_range);
+#endif
+
/**
- * __vmalloc_node - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
- * and __GFP_NOFAIL are not supported
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
+ * and __GFP_NOFAIL are not supported
*
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
- * with mm people.
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
+ * with mm people.
*
+ * Return: pointer to the allocated memory or %NULL on error
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
@@ -1818,13 +1849,16 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
}
/**
- * vmalloc - allocate virtually contiguous memory
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * vmalloc - allocate virtually contiguous memory
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc(unsigned long size)
{
@@ -1834,14 +1868,17 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
- * vzalloc - allocate virtually contiguous memory with zero fill
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
- * The memory allocated is set to zero.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc(unsigned long size)
{
@@ -1856,34 +1893,30 @@ EXPORT_SYMBOL(vzalloc);
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);
/**
- * vmalloc_node - allocate memory on a specific node
- * @size: allocation size
- * @node: numa node
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
*
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_node(unsigned long size, int node)
{
@@ -1903,6 +1936,8 @@ EXPORT_SYMBOL(vmalloc_node);
*
* For tight control over page level allocator and protection flags
* use __vmalloc_node() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
@@ -1912,17 +1947,18 @@ void *vzalloc_node(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
*
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
+ * Kernel-internal function to allocate enough pages to cover @size
+ * the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Return: pointer to the allocated memory or %NULL on error
*/
-
void *vmalloc_exec(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
@@ -1942,11 +1978,13 @@ void *vmalloc_exec(unsigned long size)
#endif
/**
- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
- * @size: allocation size
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
*
- * Allocate enough 32bit PA addressable pages to cover @size from the
- * page level allocator and map them into contiguous kernel virtual space.
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32(unsigned long size)
{
@@ -1957,23 +1995,19 @@ EXPORT_SYMBOL(vmalloc_32);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
- * @size: allocation size
+ * @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);
@@ -2059,31 +2093,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * Returns # of bytes which addr and buf should be increased.
- * (same number to @count). Returns 0 if [addr...addr+count) doesn't
- * includes any intersect with alive vmalloc area.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
- *
+ * vread() - read vmalloc area in a safe way.
+ * @buf: buffer for reading data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
+ *
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any informaion, as /dev/kmem.
+ *
+ * Return: number of bytes for which addr and buf should be increased
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
+ * include any intersection with valid vmalloc area
*/
-
long vread(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
@@ -2140,31 +2172,29 @@ finished:
}
/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
- *
- * Returns # of bytes which addr and buf should be incresed.
- * (same number to @count).
- * If [addr...addr+count) doesn't includes any intersect with valid
- * vmalloc area, returns 0.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
+ * vwrite() - write vmalloc area in a safe way.
+ * @buf: buffer for source data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from a buffer to the given addr. If specified range of
+ * [addr...addr+count) includes some valid address, data is copied from
+ * proper area of @buf. If there are memory holes, no copy to hole.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't includes any intersects with alive
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
+ *
+ * Note: In usual ops, vwrite() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any informaion, as /dev/kmem.
+ *
+ * Return: number of bytes for which addr and buf should be
+ * increased (same number as @count) or %0 if [addr...addr+count)
+ * doesn't include any intersection with valid vmalloc area
*/
-
long vwrite(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
@@ -2216,20 +2246,20 @@ finished:
}
/**
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
- * @vma: vma to cover
- * @uaddr: target user address to start at
- * @kaddr: virtual address of vmalloc kernel memory
- * @size: size of map area
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc kernel memory
+ * @size: size of map area
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that @kaddr is a valid vmalloc'ed area,
- * and that it is big enough to cover the range starting at
- * @uaddr in @vma. Will return failure if that criteria isn't
- * met.
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if that criteria isn't
+ * met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
void *kaddr, unsigned long size)
@@ -2248,7 +2278,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
@@ -2271,18 +2301,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
- * remap_vmalloc_range - map vmalloc pages to userspace
- * @vma: vma to cover (map full range of vma)
- * @addr: vmalloc memory
- * @pgoff: number of pages into addr before first page to map
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * that it is big enough to cover the vma. Will return failure if
- * that criteria isn't met.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
@@ -2314,18 +2344,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
}
/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @ptes: returns the PTEs for the address space
*
- * Returns: NULL on failure, vm_struct on success
+ * Returns: NULL on failure, vm_struct on success
*
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created.
*
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
+ * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
+ * allocated for the VM area are returned.
*/
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
@@ -2751,4 +2781,3 @@ static int __init proc_vmalloc_init(void)
module_init(proc_vmalloc_init);
#endif
-
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a714c4f800e9..a5ad0b35ab8e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -374,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
- size_t size = sizeof(*shrinker->nr_deferred);
+ unsigned int size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
delta = freeable / 2;
}
- /*
- * Make sure we apply some minimal pressure on default priority
- * even on small cgroups. Stale objects are not only consuming memory
- * by themselves, but can also hold a reference to a dying cgroup,
- * preventing it from being reclaimed. A dying cgroup with all
- * corresponding structures like per-cpu stats and kmem caches
- * can be really big, so it may lead to a significant waste of memory.
- */
- delta = max_t(unsigned long long, delta, min(freeable, batch_size));
-
total_scan += delta;
if (total_scan < 0) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -962,7 +952,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
- shadow = workingset_eviction(mapping, page);
+ shadow = workingset_eviction(page);
__delete_from_page_cache(page, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
@@ -1116,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- int pgactivate = 0;
- unsigned nr_unqueued_dirty = 0;
- unsigned nr_dirty = 0;
- unsigned nr_congested = 0;
unsigned nr_reclaimed = 0;
- unsigned nr_writeback = 0;
- unsigned nr_immediate = 0;
- unsigned nr_ref_keep = 0;
- unsigned nr_unmap_fail = 0;
+ memset(stat, 0, sizeof(*stat));
cond_resched();
while (!list_empty(page_list)) {
@@ -1169,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
- nr_dirty++;
+ stat->nr_dirty++;
if (dirty && !writeback)
- nr_unqueued_dirty++;
+ stat->nr_unqueued_dirty++;
/*
* Treat this page as congested if the underlying BDI is or if
@@ -1184,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (((dirty || writeback) && mapping &&
inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
- nr_congested++;
+ stat->nr_congested++;
/*
* If a page at the tail of the LRU is under writeback, there
@@ -1233,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
- nr_immediate++;
+ stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
@@ -1251,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
- nr_writeback++;
+ stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
@@ -1271,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
- nr_ref_keep++;
+ stat->nr_ref_keep++;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1336,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
- nr_unmap_fail++;
+ stat->nr_unmap_fail++;
goto activate_locked;
}
}
@@ -1484,7 +1467,7 @@ activate_locked:
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
SetPageActive(page);
- pgactivate++;
+ stat->nr_activate++;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1499,18 +1482,8 @@ keep:
free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
- count_vm_events(PGACTIVATE, pgactivate);
-
- if (stat) {
- stat->nr_dirty = nr_dirty;
- stat->nr_congested = nr_congested;
- stat->nr_unqueued_dirty = nr_unqueued_dirty;
- stat->nr_writeback = nr_writeback;
- stat->nr_immediate = nr_immediate;
- stat->nr_activate = pgactivate;
- stat->nr_ref_keep = nr_ref_keep;
- stat->nr_unmap_fail = nr_unmap_fail;
- }
+ count_vm_events(PGACTIVATE, stat->nr_activate);
+
return nr_reclaimed;
}
@@ -1522,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
+ struct reclaim_stat dummy_stat;
unsigned long ret;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1535,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, &dummy_stat, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1640,8 +1614,8 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-/*
- * zone_lru_lock is heavily contended. Some of the functions that
+/**
+ * pgdat->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
@@ -1663,7 +1637,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
- isolate_mode_t mode, enum lru_list lru)
+ enum lru_list lru)
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
@@ -1672,6 +1646,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
+ isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
scan = 0;
for (total_scan = 0;
@@ -1775,11 +1750,11 @@ int isolate_lru_page(struct page *page)
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
- spin_lock_irq(zone_lru_lock(zone));
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ spin_lock_irq(&pgdat->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
if (PageLRU(page)) {
int lru = page_lru(page);
get_page(page);
@@ -1787,7 +1762,7 @@ int isolate_lru_page(struct page *page)
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
}
return ret;
}
@@ -1909,8 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- struct reclaim_stat stat = {};
- isolate_mode_t isolate_mode = 0;
+ struct reclaim_stat stat;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -1931,13 +1905,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2019,9 +1990,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone_lru_lock across the whole operation. But if
+ * appropriate to hold pgdat->lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone_lru_lock around each page. It's impossible to balance
+ * should drop pgdat->lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
@@ -2094,19 +2065,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2764,16 +2731,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_reclaimed - reclaimed);
/*
- * Direct reclaim and kswapd have to scan all memory
- * cgroups to fulfill the overall scan target for the
- * node.
+ * Kswapd have to scan all memory cgroups to fulfill
+ * the overall scan target for the node.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
* retry with decreasing priority if one round over the
* whole hierarchy is not sufficient.
*/
- if (!global_reclaim(sc) &&
+ if (!current_is_kswapd() &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;
@@ -3537,7 +3503,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), any page is that zone
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 83b30edc2f7f..36b56f858f0f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2121,21 +2121,14 @@ static int __init extfrag_debug_init(void)
struct dentry *extfrag_debug_root;
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
- if (!extfrag_debug_root)
- return -ENOMEM;
- if (!debugfs_create_file("unusable_index", 0444,
- extfrag_debug_root, NULL, &unusable_file_ops))
- goto fail;
+ debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
+ &unusable_file_ops);
- if (!debugfs_create_file("extfrag_index", 0444,
- extfrag_debug_root, NULL, &extfrag_file_ops))
- goto fail;
+ debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
+ &extfrag_file_ops);
return 0;
-fail:
- debugfs_remove_recursive(extfrag_debug_root);
- return -ENOMEM;
}
module_init(extfrag_debug_init);
diff --git a/mm/workingset.c b/mm/workingset.c
index dcb994f2acc2..0bedf67502d5 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -215,13 +215,12 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
/**
* workingset_eviction - note the eviction of a page from memory
- * @mapping: address space the page was backing
* @page: the page being evicted
*
- * Returns a shadow entry to be stored in @mapping->i_pages in place
+ * Returns a shadow entry to be stored in @page->mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
-void *workingset_eviction(struct address_space *mapping, struct page *page)
+void *workingset_eviction(struct page *page)
{
struct pglist_data *pgdat = page_pgdat(page);
struct mem_cgroup *memcg = page_memcg(page);