diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-29 02:36:48 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-29 02:36:48 +0300 |
commit | 1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9 (patch) | |
tree | 6d227487ca2cf391589c73af1c40ec7b7126feec | |
parent | 6039b80eb50a893476fea7d56e86ed2d19290054 (diff) | |
parent | c3486f5376696034d0fcbef8ba70c70cfcb26f51 (diff) | |
download | linux-1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"The rest of MM"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits)
mm, compaction: simplify contended compaction handling
mm, compaction: introduce direct compaction priority
mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
mm, page_alloc: make THP-specific decisions more generic
mm, page_alloc: restructure direct compaction handling in slowpath
mm, page_alloc: don't retry initial attempt in slowpath
mm, page_alloc: set alloc_flags only once in slowpath
lib/stackdepot.c: use __GFP_NOWARN for stack allocations
mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB
mm, kasan: account for object redzone in SLUB's nearest_obj()
mm: fix use-after-free if memory allocation failed in vma_adjust()
zsmalloc: Delete an unnecessary check before the function call "iput"
mm/memblock.c: fix index adjustment error in __next_mem_range_rev()
mem-hotplug: alloc new page from a nearest neighbor node when mem-offline
mm: optimize copy_page_to/from_iter_iovec
mm: add cond_resched() to generic_swapfile_activate()
Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"
mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode
mm: hwpoison: remove incorrect comments
make __section_nr() more efficient
...
90 files changed, 2517 insertions, 1978 deletions
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt index 8870b0212150..78a8c2963b38 100644 --- a/Documentation/cgroup-v1/memcg_test.txt +++ b/Documentation/cgroup-v1/memcg_test.txt @@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. 8. LRU Each memcg has its own private LRU. Now, its handling is under global - VM's control (means that it's handled under global zone->lru_lock). + VM's control (means that it's handled under global zone_lru_lock). Almost all routines around memcg's LRU is called by global LRU's - list management functions under zone->lru_lock(). + list management functions under zone_lru_lock(). A special function is mem_cgroup_isolate_pages(). This scans memcg's private LRU and call __isolate_lru_page() to extract a page diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt index b14abf217239..946e69103cdd 100644 --- a/Documentation/cgroup-v1/memory.txt +++ b/Documentation/cgroup-v1/memory.txt @@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered. Other lock order is following: PG_locked. mm->page_table_lock - zone->lru_lock + zone_lru_lock lock_page_cgroup. In many cases, just lock_page_cgroup() is called. per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by - zone->lru_lock, it has no lock of its own. + zone_lru_lock, it has no lock of its own. 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 2ade7a6a10a7..bbb7ee76e319 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -224,7 +224,7 @@ void __init arm64_memblock_init(void) * via the linear mapping. */ if (memory_limit != (phys_addr_t)ULLONG_MAX) { - memblock_enforce_memory_limit(memory_limit); + memblock_mem_limit_remove_map(memory_limit); memblock_add(__pa(_text), (u64)(_end - _text)); } diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index edcf2a706942..598df5708501 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data) mem_data->totalhigh = P2K(val.totalhigh); mem_data->freehigh = P2K(val.freehigh); mem_data->bufferram = P2K(val.bufferram); - mem_data->cached = P2K(global_page_state(NR_FILE_PAGES) + mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES) - val.bufferram); si_swapinfo(&val); diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index c4d5bf841a7f..7cc6ee7f1a58 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -45,20 +45,20 @@ void show_mem(unsigned int filter) struct zone *zone; pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n", - (global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_ACTIVE_FILE)), - (global_page_state(NR_INACTIVE_ANON) + - global_page_state(NR_INACTIVE_FILE)), - global_page_state(NR_FILE_DIRTY), - global_page_state(NR_WRITEBACK), - global_page_state(NR_UNSTABLE_NFS), + (global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE)), + (global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_FILE)), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_FREE_PAGES), (global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_FILE_MAPPED), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), - global_page_state(NR_FILE_PAGES), + global_node_page_state(NR_FILE_PAGES), get_nr_swap_pages()); for_each_zone(zone) { diff --git a/drivers/base/node.c b/drivers/base/node.c index 51c7db2c4ee2..29cd96661b30 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev, { int n; int nid = dev->id; + struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; si_meminfo_node(&i, nid); @@ -74,16 +75,16 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), - nid, K(node_page_state(nid, NR_ACTIVE_ANON) + - node_page_state(nid, NR_ACTIVE_FILE)), - nid, K(node_page_state(nid, NR_INACTIVE_ANON) + - node_page_state(nid, NR_INACTIVE_FILE)), - nid, K(node_page_state(nid, NR_ACTIVE_ANON)), - nid, K(node_page_state(nid, NR_INACTIVE_ANON)), - nid, K(node_page_state(nid, NR_ACTIVE_FILE)), - nid, K(node_page_state(nid, NR_INACTIVE_FILE)), - nid, K(node_page_state(nid, NR_UNEVICTABLE)), - nid, K(node_page_state(nid, NR_MLOCK))); + nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) + + node_page_state(pgdat, NR_ACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) + + node_page_state(pgdat, NR_INACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)), + nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)), + nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)), + nid, K(node_page_state(pgdat, NR_UNEVICTABLE)), + nid, K(sum_zone_node_page_state(nid, NR_MLOCK))); #ifdef CONFIG_HIGHMEM n += sprintf(buf + n, @@ -117,31 +118,30 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d ShmemPmdMapped: %8lu kB\n" #endif , - nid, K(node_page_state(nid, NR_FILE_DIRTY)), - nid, K(node_page_state(nid, NR_WRITEBACK)), - nid, K(node_page_state(nid, NR_FILE_PAGES)), - nid, K(node_page_state(nid, NR_FILE_MAPPED)), - nid, K(node_page_state(nid, NR_ANON_PAGES)), + nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), + nid, K(node_page_state(pgdat, NR_WRITEBACK)), + nid, K(node_page_state(pgdat, NR_FILE_PAGES)), + nid, K(node_page_state(pgdat, NR_FILE_MAPPED)), + nid, K(node_page_state(pgdat, NR_ANON_MAPPED)), nid, K(i.sharedram), - nid, node_page_state(nid, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, - nid, K(node_page_state(nid, NR_PAGETABLE)), - nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), - nid, K(node_page_state(nid, NR_BOUNCE)), - nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)), - nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + - node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), - nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), + nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), + nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), + nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), + nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) + + sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), - nid, K(node_page_state(nid, NR_ANON_THPS) * + nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), - nid, K(node_page_state(nid, NR_SHMEM_THPS) * + nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), - nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) * + nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)); #else - nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); #endif n += hugetlb_report_node_meminfo(nid, buf + n); return n; @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - node_page_state(dev->id, NUMA_HIT), - node_page_state(dev->id, NUMA_MISS), - node_page_state(dev->id, NUMA_FOREIGN), - node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - node_page_state(dev->id, NUMA_LOCAL), - node_page_state(dev->id, NUMA_OTHER)); + sum_zone_node_page_state(dev->id, NUMA_HIT), + sum_zone_node_page_state(dev->id, NUMA_MISS), + sum_zone_node_page_state(dev->id, NUMA_FOREIGN), + sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_node_page_state(dev->id, NUMA_LOCAL), + sum_zone_node_page_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -173,12 +173,18 @@ static ssize_t node_read_vmstat(struct device *dev, struct device_attribute *attr, char *buf) { int nid = dev->id; + struct pglist_data *pgdat = NODE_DATA(nid); int i; int n = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], - node_page_state(nid, i)); + sum_zone_node_page_state(nid, i)); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + node_page_state(pgdat, i)); return n; } diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 24d2745e9437..45a1b4ec4ca3 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout; static unsigned long lowmem_count(struct shrinker *s, struct shrink_control *sc) { - return global_page_state(NR_ACTIVE_ANON) + - global_page_state(NR_ACTIVE_FILE) + - global_page_state(NR_INACTIVE_ANON) + - global_page_state(NR_INACTIVE_FILE); + return global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_FILE); } static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) @@ -91,8 +91,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) short selected_oom_score_adj; int array_size = ARRAY_SIZE(lowmem_adj); int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages; - int other_file = global_page_state(NR_FILE_PAGES) - - global_page_state(NR_SHMEM) - + int other_file = global_node_page_state(NR_FILE_PAGES) - + global_node_page_state(NR_SHMEM) - total_swapcache_pages(); if (lowmem_adj_size < array_size) diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c index d1a7d6beee60..d011135802d5 100644 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ b/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -1864,7 +1864,8 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req) LASSERT(page_count >= 0); for (i = 0; i < page_count; i++) - dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); + dec_node_page_state(desc->bd_iov[i].kiov_page, + NR_UNSTABLE_NFS); atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr); LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); @@ -1898,7 +1899,8 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req) LASSERT(page_count >= 0); for (i = 0; i < page_count; i++) - inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); + inc_node_page_state(desc->bd_iov[i].kiov_page, + NR_UNSTABLE_NFS); LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6f9c9f6f5157..56c8fda436c0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1807,8 +1807,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) */ static unsigned long get_nr_dirty_pages(void) { - return global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + + return global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9154f8679024..2382f22a2a8b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1452,7 +1452,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); + dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); @@ -1642,7 +1642,7 @@ static int fuse_writepage_locked(struct page *page) req->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); list_add(&req->writepages_entry, &fi->writepages); @@ -1756,7 +1756,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_zone_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(page, NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); @@ -1855,7 +1855,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; if (is_writeback && fuse_writepage_in_flight(req, page)) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5154fa65a2f2..5ea04d87fc65 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -623,7 +623,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_node_page_state(page, NR_UNSTABLE_NFS); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..593fa21a02c0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -898,7 +898,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_UNSTABLE_NFS); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, WB_RECLAIMABLE); } diff --git a/fs/proc/base.c b/fs/proc/base.c index a11eb7196ec8..31370da2ee7c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1024,23 +1024,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; - unsigned long flags; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) - oom_adj = OOM_ADJUST_MAX; - else - oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / - OOM_SCORE_ADJ_MAX; - unlock_task_sighand(task, &flags); - } + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } +static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) +{ + static DEFINE_MUTEX(oom_adj_mutex); + struct mm_struct *mm = NULL; + struct task_struct *task; + int err = 0; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + + mutex_lock(&oom_adj_mutex); + if (legacy) { + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + } else { + if ((short)oom_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + } + + /* + * Make sure we will check other processes sharing the mm if this is + * not vfrok which wants its own oom_score_adj. + * pin the mm so it doesn't go away and get reused after task_unlock + */ + if (!task->vfork_done) { + struct task_struct *p = find_lock_task_mm(task); + + if (p) { + if (atomic_read(&p->mm->mm_users) > 1) { + mm = p->mm; + atomic_inc(&mm->mm_count); + } + task_unlock(p); + } + } + + task->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = (short)oom_adj; + trace_oom_score_adj_update(task); + + if (mm) { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) { + if (same_thread_group(task, p)) + continue; + + /* do not touch kernel threads or the global init */ + if (p->flags & PF_KTHREAD || is_global_init(p)) + continue; + + task_lock(p); + if (!p->vfork_done && process_shares_mm(p, mm)) { + pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", + task_pid_nr(p), p->comm, + p->signal->oom_score_adj, oom_adj, + task_pid_nr(task), task->comm); + p->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + p->signal->oom_score_adj_min = (short)oom_adj; + } + task_unlock(p); + } + rcu_read_unlock(); + mmdrop(mm); + } +err_unlock: + mutex_unlock(&oom_adj_mutex); + put_task_struct(task); + return err; +} + /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a @@ -1054,10 +1138,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task; char buffer[PROC_NUMBUF]; int oom_adj; - unsigned long flags; int err; memset(buffer, 0, sizeof(buffer)); @@ -1077,23 +1159,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file_inode(file)); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. @@ -1103,27 +1168,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; - if (oom_adj < task->signal->oom_score_adj && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * /proc/pid/oom_adj is provided for legacy purposes, ask users to use - * /proc/pid/oom_score_adj instead. - */ - pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - - task->signal->oom_score_adj = oom_adj; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); + err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } @@ -1140,15 +1185,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; - unsigned long flags; size_t len; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - oom_score_adj = task->signal->oom_score_adj; - unlock_task_sighand(task, &flags); - } + oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -1157,9 +1198,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task; char buffer[PROC_NUMBUF]; - unsigned long flags; int oom_score_adj; int err; @@ -1180,39 +1219,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file_inode(file)); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if ((short)oom_score_adj < task->signal->oom_score_adj_min && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - task->signal->oom_score_adj = (short)oom_score_adj; - if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = (short)oom_score_adj; - trace_oom_score_adj_update(task); - -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); + err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? err : count; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index cf301a9ef512..09e18fdf61e5 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); - cached = global_page_state(NR_FILE_PAGES) - + cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif K(i.totalswap), K(i.freeswap), - K(global_page_state(NR_FILE_DIRTY)), - K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), - K(global_page_state(NR_FILE_MAPPED)), + K(global_node_page_state(NR_FILE_DIRTY)), + K(global_node_page_state(NR_WRITEBACK)), + K(global_node_page_state(NR_ANON_MAPPED)), + K(global_node_page_state(NR_FILE_MAPPED)), K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + global_page_state(NR_KERNEL_STACK_KB), K(global_page_state(NR_PAGETABLE)), #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), #endif - K(global_page_state(NR_UNSTABLE_NFS)), + K(global_node_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), - K(global_page_state(NR_WRITEBACK_TEMP)), + K(global_node_page_state(NR_WRITEBACK_TEMP)), K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, @@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) - , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) - , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) + , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) + , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) + , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) #endif #ifdef CONFIG_CMA , K(totalcma_pages) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c82794f20110..491a91717788 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) } long congestion_wait(int sync, long timeout); -long wait_iff_congested(struct zone *zone, int sync, long timeout); +long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 1a02dab16646..d4e106b5dc27 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -1,6 +1,18 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H +/* + * Determines how hard direct compaction should try to succeed. + * Lower value means higher priority, analogically to reclaim priority. + */ +enum compact_priority { + COMPACT_PRIO_SYNC_LIGHT, + MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, + DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT, + COMPACT_PRIO_ASYNC, + INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC +}; + /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { @@ -43,14 +55,6 @@ enum compact_result { COMPACT_PARTIAL, }; -/* Used to signal whether compaction detected need_sched() or lock contention */ -/* No contention detected */ -#define COMPACT_CONTENDED_NONE 0 -/* Either need_sched() was true or fatal signal pending */ -#define COMPACT_CONTENDED_SCHED 1 -/* Zone lock or lru_lock was contended in async compaction */ -#define COMPACT_CONTENDED_LOCK 2 - struct alloc_context; /* in mm/internal.h */ #ifdef CONFIG_COMPACTION @@ -64,9 +68,8 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, - unsigned int order, - unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended); + unsigned int order, unsigned int alloc_flags, + const struct alloc_context *ac, enum compact_priority prio); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, @@ -151,14 +154,6 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, - unsigned int order, int alloc_flags, - const struct alloc_context *ac, - enum migrate_mode mode, int *contended) -{ - return COMPACT_CONTINUE; -} - static inline void compact_pgdat(pg_data_t *pgdat, int order) { } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c29e9d347bc6..f8041f9de31e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -237,9 +237,11 @@ struct vm_area_struct; * are expected to be movable via page reclaim or page migration. Typically, * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE. * - * GFP_TRANSHUGE is used for THP allocations. They are compound allocations - * that will fail quickly if memory is not available and will not wake - * kswapd on failure. + * GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are + * compound allocations that will generally fail quickly if memory is not + * available and will not wake kswapd/kcompactd on failure. The _LIGHT + * version does not attempt reclaim/compaction at all and is by default used + * in page fault path, while the non-light is used by khugepaged. */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) @@ -254,9 +256,9 @@ struct vm_area_struct; #define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) -#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ - ~__GFP_RECLAIM) +#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) +#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 92ce91c03cd0..6f14de45b5ce 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,7 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); -extern int madvise_free_huge_pmd(struct mmu_gather *tlb, +extern bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); extern int zap_huge_pmd(struct mmu_gather *tlb, diff --git a/include/linux/kasan.h b/include/linux/kasan.h index ac4b3c46a84d..c9cf374445d8 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -77,6 +77,7 @@ void kasan_free_shadow(const struct vm_struct *vm); size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } +size_t kasan_metadata_size(struct kmem_cache *cache); #else /* CONFIG_KASAN */ @@ -121,6 +122,7 @@ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} static inline void kasan_unpoison_slab(const void *ptr) { } +static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #endif /* CONFIG_KASAN */ diff --git a/include/linux/kdb.h b/include/linux/kdb.h index a19bcf9e762e..410decacff8f 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void); static inline int kdb_process_cpu(const struct task_struct *p) { - unsigned int cpu = task_thread_info(p)->cpu; + unsigned int cpu = task_cpu(p); if (cpu > num_possible_cpus()) cpu = 0; return cpu; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6c14b6179727..2925da23505d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -332,6 +332,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); +void memblock_mem_limit_remove_map(phys_addr_t limit); bool memblock_is_memory(phys_addr_t addr); int memblock_is_map_memory(phys_addr_t addr); int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 71aff733a497..5d8ca6e02e39 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,7 +52,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, MEMCG_SOCK, @@ -60,7 +60,7 @@ enum mem_cgroup_stat_index { }; struct mem_cgroup_reclaim_cookie { - struct zone *zone; + pg_data_t *pgdat; int priority; unsigned int generation; }; @@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter { /* * per-zone information in memory controller. */ -struct mem_cgroup_per_zone { +struct mem_cgroup_per_node { struct lruvec lruvec; unsigned long lru_size[NR_LRU_LISTS]; @@ -132,10 +132,6 @@ struct mem_cgroup_per_zone { /* use container_of */ }; -struct mem_cgroup_per_node { - struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; -}; - struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; @@ -314,8 +310,46 @@ void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); -struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); +static struct mem_cgroup_per_node * +mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) +{ + return memcg->nodeinfo[nid]; +} + +/** + * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone + * @node: node of the wanted lruvec + * @memcg: memcg of the wanted lruvec + * + * Returns the lru list vector holding pages for a given @node or a given + * @memcg and @zone. This can be the node lruvec, if the memory controller + * is disabled. + */ +static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *mz; + struct lruvec *lruvec; + + if (mem_cgroup_disabled()) { + lruvec = node_lruvec(pgdat); + goto out; + } + + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->pgdat here; + * and if offlined then reonlined, we need to reinitialize it. + */ + if (unlikely(lruvec->pgdat != pgdat)) + lruvec->pgdat = pgdat; + return lruvec; +} + +struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -404,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->lru_size[lru]; } @@ -477,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, mem_cgroup_update_page_stat(page, idx, -1); } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); @@ -568,16 +602,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } -static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, + struct mem_cgroup *memcg) { - return &zone->lruvec; + return node_lruvec(pgdat); } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, - struct zone *zone) + struct pglist_data *pgdat) { - return &zone->lruvec; + return &pgdat->lruvec; } static inline bool mm_match_cgroup(struct mm_struct *mm, @@ -681,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, } static inline -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { diff --git a/include/linux/memremap.h b/include/linux/memremap.h index bcaa634139a9..93416196ba64 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -26,7 +26,7 @@ struct vmem_altmap { unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); -#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) +#ifdef CONFIG_ZONE_DEVICE struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); #else static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) diff --git a/include/linux/mm.h b/include/linux/mm.h index 192c1bbe5fcd..08ed53eeedd5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -933,6 +933,11 @@ static inline struct zone *page_zone(const struct page *page) return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } +static inline pg_data_t *page_pgdat(const struct page *page) +{ + return NODE_DATA(page_to_nid(page)); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { @@ -973,11 +978,21 @@ static inline struct mem_cgroup *page_memcg(struct page *page) { return page->mem_cgroup; } +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return READ_ONCE(page->mem_cgroup); +} #else static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; } +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} #endif /* @@ -2284,6 +2299,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ +extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); + #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5bd29ba4f174..71613e8a720f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -23,25 +23,30 @@ static inline int page_is_file_cache(struct page *page) } static __always_inline void __update_lru_size(struct lruvec *lruvec, - enum lru_list lru, int nr_pages) + enum lru_list lru, enum zone_type zid, + int nr_pages) { - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_zone_page_state(&pgdat->node_zones[zid], + NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, - enum lru_list lru, int nr_pages) + enum lru_list lru, enum zone_type zid, + int nr_pages) { + __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, nr_pages); -#else - __update_lru_size(lruvec, lru, nr_pages); #endif } static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - update_lru_size(lruvec, lru, hpage_nr_pages(page)); + update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } @@ -49,7 +54,7 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { list_del(&page->lru); - update_lru_size(lruvec, lru, -hpage_nr_pages(page)); + update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page)); } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 79472b22d23f..903200f4ec41 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -118,7 +118,7 @@ struct page { */ union { struct list_head lru; /* Pageout list, eg. active_list - * protected by zone->lru_lock ! + * protected by zone_lru_lock ! * Can be used as a generic list * by the page owner. */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 19425e988bdc..f2e4e90621ec 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -93,7 +93,7 @@ struct free_area { struct pglist_data; /* - * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. + * zone->lock and the zone lru_lock are two of the hottest locks in the kernel. * So add a wild amount of padding here to ensure that they fall into separate * cachelines. There are very few zone structures in the machine, so space * consumption is not a concern here. @@ -110,36 +110,20 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, - NR_ALLOC_BATCH, - NR_LRU_BASE, - NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ - NR_ACTIVE_ANON, /* " " " " " */ - NR_INACTIVE_FILE, /* " " " " " */ - NR_ACTIVE_FILE, /* " " " " " */ - NR_UNEVICTABLE, /* " " " " " */ + NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ + NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, + NR_ZONE_ACTIVE_ANON, + NR_ZONE_INACTIVE_FILE, + NR_ZONE_ACTIVE_FILE, + NR_ZONE_UNEVICTABLE, + NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_ANON_PAGES, /* Mapped anonymous pages */ - NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. - only modified from process context */ - NR_FILE_PAGES, - NR_FILE_DIRTY, - NR_WRITEBACK, NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ - NR_KERNEL_STACK, + NR_KERNEL_STACK_KB, /* measured in KiB */ /* Second 128 byte cacheline */ - NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, - NR_VMSCAN_WRITE, - NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ - NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ - NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ - NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ - NR_DIRTIED, /* page dirtyings since bootup */ - NR_WRITTEN, /* page writings since bootup */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif @@ -151,14 +135,40 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_FREE_CMA_PAGES, + NR_VM_ZONE_STAT_ITEMS }; + +enum node_stat_item { + NR_LRU_BASE, + NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ + NR_ACTIVE_ANON, /* " " " " " */ + NR_INACTIVE_FILE, /* " " " " " */ + NR_ACTIVE_FILE, /* " " " " " */ + NR_UNEVICTABLE, /* " " " " " */ + NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ + NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, - NR_ANON_THPS, + NR_ANON_MAPPED, /* Mapped anonymous pages */ + NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. + only modified from process context */ + NR_FILE_PAGES, + NR_FILE_DIRTY, + NR_WRITEBACK, + NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ + NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, - NR_FREE_CMA_PAGES, - NR_VM_ZONE_STAT_ITEMS }; + NR_ANON_THPS, + NR_UNSTABLE_NFS, /* NFS unstable pages */ + NR_VMSCAN_WRITE, + NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ + NR_DIRTIED, /* page dirtyings since bootup */ + NR_WRITTEN, /* page writings since bootup */ + NR_VM_NODE_STAT_ITEMS +}; /* * We do arithmetic on the LRU lists in various places in the code, @@ -215,7 +225,7 @@ struct lruvec { /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; #ifdef CONFIG_MEMCG - struct zone *zone; + struct pglist_data *pgdat; #endif }; @@ -267,6 +277,11 @@ struct per_cpu_pageset { #endif }; +struct per_cpu_nodestat { + s8 stat_threshold; + s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; +}; + #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { @@ -348,22 +363,9 @@ struct zone { #ifdef CONFIG_NUMA int node; #endif - - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this zone's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; - /* - * This is a per-zone reserve of pages that are not available - * to userspace allocations. - */ - unsigned long totalreserve_pages; - #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. @@ -372,14 +374,6 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_NUMA - /* - * zone reclaim becomes active if more unmapped pages exist. - */ - unsigned long min_unmapped_pages; - unsigned long min_slab_pages; -#endif /* CONFIG_NUMA */ - /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; @@ -472,24 +466,21 @@ struct zone { unsigned long wait_table_hash_nr_entries; unsigned long wait_table_bits; + /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) + /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; /* zone flags, see below */ unsigned long flags; - /* Write-intensive fields used from the page allocator */ + /* Primarily protects free_area */ spinlock_t lock; + /* Write-intensive fields used by compaction and vmstats. */ ZONE_PADDING(_pad2_) - /* Write-intensive fields used by page reclaim */ - - /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; - struct lruvec lruvec; - /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter @@ -527,19 +518,18 @@ struct zone { atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; -enum zone_flags { - ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ - ZONE_CONGESTED, /* zone has many dirty pages backed by +enum pgdat_flags { + PGDAT_CONGESTED, /* pgdat has many dirty pages backed by * a congested BDI */ - ZONE_DIRTY, /* reclaim scanning has recently found + PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ - ZONE_WRITEBACK, /* reclaim scanning has recently found + PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ - ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ + PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; static inline unsigned long zone_end_pfn(const struct zone *zone) @@ -663,8 +653,9 @@ typedef struct pglist_data { wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ - int kswapd_max_order; - enum zone_type classzone_idx; + int kswapd_order; + enum zone_type kswapd_classzone_idx; + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; @@ -681,6 +672,23 @@ typedef struct pglist_data { /* Number of pages migrated during the rate limiting time interval */ unsigned long numabalancing_migrate_nr_pages; #endif + /* + * This is a per-node reserve of pages that are not available + * to userspace allocations. + */ + unsigned long totalreserve_pages; + +#ifdef CONFIG_NUMA + /* + * zone reclaim becomes active if more unmapped pages exist. + */ + unsigned long min_unmapped_pages; + unsigned long min_slab_pages; +#endif /* CONFIG_NUMA */ + + /* Write-intensive fields used by page reclaim */ + ZONE_PADDING(_pad1_) + spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -695,6 +703,23 @@ typedef struct pglist_data { struct list_head split_queue; unsigned long split_queue_len; #endif + + /* Fields commonly accessed by the page reclaim scanner */ + struct lruvec lruvec; + + /* + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this node's LRU. Maintained by the pageout code. + */ + unsigned int inactive_ratio; + + unsigned long flags; + + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ + struct per_cpu_nodestat __percpu *per_cpu_nodestats; + atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -708,6 +733,15 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) +static inline spinlock_t *zone_lru_lock(struct zone *zone) +{ + return &zone->zone_pgdat->lru_lock; +} + +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) +{ + return &pgdat->lruvec; +} static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { @@ -760,12 +794,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, extern void lruvec_init(struct lruvec *lruvec); -static inline struct zone *lruvec_zone(struct lruvec *lruvec) +static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG - return lruvec->zone; + return lruvec->pgdat; #else - return container_of(lruvec, struct zone, lruvec); + return container_of(lruvec, struct pglist_data, lruvec); #endif } diff --git a/include/linux/oom.h b/include/linux/oom.h index 606137b3b778..5bc0457ee3a8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern void mark_oom_victim(struct task_struct *tsk); #ifdef CONFIG_MMU -extern void try_oom_reaper(struct task_struct *tsk); +extern void wake_oom_reaper(struct task_struct *tsk); #else -static inline void try_oom_reaper(struct task_struct *tsk) +static inline void wake_oom_reaper(struct task_struct *tsk) { } #endif @@ -107,27 +107,7 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -static inline bool task_will_free_mem(struct task_struct *task) -{ - struct signal_struct *sig = task->signal; - - /* - * A coredumping process may sleep for an extended period in exit_mm(), - * so the oom killer cannot assume that the process will promptly exit - * and release memory. - */ - if (sig->flags & SIGNAL_GROUP_COREDUMP) - return false; - - if (!(task->flags & PF_EXITING)) - return false; - - /* Make sure that the whole thread group is going down */ - if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT)) - return false; - - return true; -} +bool task_will_free_mem(struct task_struct *task); /* sysctls */ extern int sysctl_oom_dump_tasks; diff --git a/include/linux/sched.h b/include/linux/sched.h index d99218a1e043..553af2923824 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_REAPED 21 /* mm has been already reaped */ +#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p) #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +static inline bool in_vfork(struct task_struct *tsk) +{ + bool ret; + + /* + * need RCU to access ->real_parent if CLONE_VM was used along with + * CLONE_PARENT. + * + * We check real_parent->mm == tsk->mm because CLONE_VFORK does not + * imply CLONE_VM + * + * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus + * ->real_parent is not necessarily the task doing vfork(), so in + * theory we can't rely on task_lock() if we want to dereference it. + * + * And in this case we can't trust the real_parent->mm == tsk->mm + * check, it can be false negative. But we do not care, if init or + * another oom-unkillable task does this it should blame itself. + */ + rcu_read_lock(); + ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm; + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 339ba027ade9..4ad2c5a26399 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -88,7 +88,8 @@ struct kmem_cache { }; static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, - void *x) { + void *x) +{ void *object = x - (x - page->s_mem) % cache->size; void *last_object = page->s_mem + (cache->num - 1) * cache->size; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5624c1f3eb0a..75f56c2ef2d4 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -104,6 +104,10 @@ struct kmem_cache { unsigned int *random_seq; #endif +#ifdef CONFIG_KASAN + struct kasan_cache kasan_info; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; @@ -119,15 +123,17 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason); +void *fixup_red_left(struct kmem_cache *s, void *p); + static inline void *nearest_obj(struct kmem_cache *cache, struct page *page, void *x) { void *object = x - (x - page_address(page)) % cache->size; void *last_object = page_address(page) + (page->objects - 1) * cache->size; - if (unlikely(object > last_object)) - return last_object; - else - return object; + void *result = (unlikely(object > last_object)) ? last_object : object; + + result = fixup_red_left(cache, result); + return result; } #endif /* _LINUX_SLUB_DEF_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 0af2bb2028fd..b17cc4830fa6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -157,15 +157,6 @@ enum { #define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* - * Ratio between zone->managed_pages and the "gap" that above the per-zone - * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that - * do not meet the (high_wmark + gap) watermark, even which already met the - * high_wmark, in order to provide better per-zone lru behavior. We are ok to - * spend not more than 1% of the memory for this zone balancing "gap". - */ -#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 - #define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ #define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ @@ -317,6 +308,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); +extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); @@ -324,9 +316,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, bool may_swap); -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, +extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, - struct zone *zone, + pg_data_t *pgdat, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; @@ -334,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; #ifdef CONFIG_NUMA -extern int zone_reclaim_mode; +extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; -extern int zone_reclaim(struct zone *, gfp_t, unsigned int); +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); #else -#define zone_reclaim_mode 0 -static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) +#define node_reclaim_mode 0 +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) { return 0; } diff --git a/include/linux/topology.h b/include/linux/topology.h index afce69296ac0..cb0775e1ee4b 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -54,7 +54,7 @@ int arch_update_cpu_topology(void); /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) - * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() + * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 42604173f122..4d6ec58a8d45 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -23,21 +23,23 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), + FOR_ALL_ZONES(ALLOCSTALL), + FOR_ALL_ZONES(PGSCAN_SKIP), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, PGLAZYFREED, - FOR_ALL_ZONES(PGREFILL), - FOR_ALL_ZONES(PGSTEAL_KSWAPD), - FOR_ALL_ZONES(PGSTEAL_DIRECT), - FOR_ALL_ZONES(PGSCAN_KSWAPD), - FOR_ALL_ZONES(PGSCAN_DIRECT), + PGREFILL, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, #endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, - PAGEOUTRUN, ALLOCSTALL, PGROTATED, + PAGEOUTRUN, PGROTATED, DROP_PAGECACHE, DROP_SLAB, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2da8e053210..613771909b6e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -101,25 +101,42 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_vmacache_event(x) do {} while (0) #endif -#define __count_zone_vm_events(item, zone, delta) \ - __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ - zone_idx(zone), delta) +#define __count_zid_vm_events(item, zid, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* - * Zone based page accounting with per cpu differentials. + * Zone and node-based page accounting with per cpu differentials. */ -extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; +extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); - atomic_long_add(x, &vm_stat[item]); + atomic_long_add(x, &vm_zone_stat[item]); +} + +static inline void node_page_state_add(long x, struct pglist_data *pgdat, + enum node_stat_item item) +{ + atomic_long_add(x, &pgdat->vm_stat[item]); + atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_page_state(enum zone_stat_item item) { - long x = atomic_long_read(&vm_stat[item]); + long x = atomic_long_read(&vm_zone_stat[item]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +static inline unsigned long global_node_page_state(enum node_stat_item item) +{ + long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -160,32 +177,61 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } -#ifdef CONFIG_NUMA +static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, + enum node_stat_item item) +{ + long x = atomic_long_read(&pgdat->vm_stat[item]); -extern unsigned long node_page_state(int node, enum zone_stat_item item); +#ifdef CONFIG_SMP + int cpu; + for_each_online_cpu(cpu) + x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; -#else + if (x < 0) + x = 0; +#endif + return x; +} -#define node_page_state(node, item) global_page_state(item) +#ifdef CONFIG_NUMA +extern unsigned long sum_zone_node_page_state(int node, + enum zone_stat_item item); +extern unsigned long node_page_state(struct pglist_data *pgdat, + enum node_stat_item item); +#else +#define sum_zone_node_page_state(node, item) global_page_state(item) +#define node_page_state(node, item) global_node_page_state(item) #endif /* CONFIG_NUMA */ #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) +#define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d) +#define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d)) #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); +void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); +void __inc_node_page_state(struct page *, enum node_stat_item); +void __dec_node_page_state(struct page *, enum node_stat_item); + void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); -extern void inc_zone_state(struct zone *, enum zone_stat_item); +void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); +void inc_node_page_state(struct page *, enum node_stat_item); +void dec_node_page_state(struct page *, enum node_stat_item); + +extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); +extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); +extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); @@ -213,16 +259,34 @@ static inline void __mod_zone_page_state(struct zone *zone, zone_page_state_add(delta, zone, item); } +static inline void __mod_node_page_state(struct pglist_data *pgdat, + enum node_stat_item item, int delta) +{ + node_page_state_add(delta, pgdat, item); +} + static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); - atomic_long_inc(&vm_stat[item]); + atomic_long_inc(&vm_zone_stat[item]); +} + +static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + atomic_long_inc(&pgdat->vm_stat[item]); + atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); - atomic_long_dec(&vm_stat[item]); + atomic_long_dec(&vm_zone_stat[item]); +} + +static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + atomic_long_dec(&pgdat->vm_stat[item]); + atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, @@ -231,12 +295,26 @@ static inline void __inc_zone_page_state(struct page *page, __inc_zone_state(page_zone(page), item); } +static inline void __inc_node_page_state(struct page *page, + enum node_stat_item item) +{ + __inc_node_state(page_pgdat(page), item); +} + + static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } +static inline void __dec_node_page_state(struct page *page, + enum node_stat_item item) +{ + __dec_node_state(page_pgdat(page), item); +} + + /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. @@ -245,7 +323,12 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +#define inc_node_page_state __inc_node_page_state +#define dec_node_page_state __dec_node_page_state +#define mod_node_page_state __mod_node_page_state + #define inc_zone_state __inc_zone_state +#define inc_node_state __inc_node_state #define dec_zone_state __dec_zone_state #define set_pgdat_percpu_threshold(pgdat, callback) { } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 717e6149e753..fc1e16c25a29 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data); static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); -bool zone_dirty_ok(struct zone *zone); +bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom); diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 36e2d6fb1360..c2ba402ab256 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -226,26 +226,26 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, TP_PROTO( int order, gfp_t gfp_mask, - enum migrate_mode mode), + int prio), - TP_ARGS(order, gfp_mask, mode), + TP_ARGS(order, gfp_mask, prio), TP_STRUCT__entry( __field(int, order) __field(gfp_t, gfp_mask) - __field(enum migrate_mode, mode) + __field(int, prio) ), TP_fast_assign( __entry->order = order; __entry->gfp_mask = gfp_mask; - __entry->mode = mode; + __entry->prio = prio; ), - TP_printk("order=%d gfp_mask=0x%x mode=%d", + TP_printk("order=%d gfp_mask=0x%x priority=%d", __entry->order, __entry->gfp_mask, - (int)__entry->mode) + __entry->prio) ); DECLARE_EVENT_CLASS(mm_compaction_suitable_template, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 43cedbf0c759..5a81ab48a2fb 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -11,6 +11,7 @@ #define __def_gfpflag_names \ {(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \ + {(unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ {(unsigned long)GFP_USER, "GFP_USER"}, \ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 0101ef37f1ee..c88fd0934e7e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep, TRACE_EVENT(mm_vmscan_kswapd_wake, - TP_PROTO(int nid, int order), + TP_PROTO(int nid, int zid, int order), - TP_ARGS(nid, order), + TP_ARGS(nid, zid, order), TP_STRUCT__entry( __field( int, nid ) + __field( int, zid ) __field( int, order ) ), TP_fast_assign( __entry->nid = nid; + __entry->zid = zid; __entry->order = order; ), - TP_printk("nid=%d order=%d", __entry->nid, __entry->order) + TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, @@ -98,47 +100,50 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), + TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), - TP_ARGS(order, may_writepage, gfp_flags), + TP_ARGS(order, may_writepage, gfp_flags, classzone_idx), TP_STRUCT__entry( __field( int, order ) __field( int, may_writepage ) __field( gfp_t, gfp_flags ) + __field( int, classzone_idx ) ), TP_fast_assign( __entry->order = order; __entry->may_writepage = may_writepage; __entry->gfp_flags = gfp_flags; + __entry->classzone_idx = classzone_idx; ), - TP_printk("order=%d may_writepage=%d gfp_flags=%s", + TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d", __entry->order, __entry->may_writepage, - show_gfp_flags(__entry->gfp_flags)) + show_gfp_flags(__entry->gfp_flags), + __entry->classzone_idx) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), + TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), - TP_ARGS(order, may_writepage, gfp_flags) + TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), + TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), - TP_ARGS(order, may_writepage, gfp_flags) + TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags), + TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), - TP_ARGS(order, may_writepage, gfp_flags) + TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, @@ -266,16 +271,18 @@ TRACE_EVENT(mm_shrink_slab_end, DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, - TP_PROTO(int order, + TP_PROTO(int classzone_idx, + int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_taken, isolate_mode_t isolate_mode, int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), TP_STRUCT__entry( + __field(int, classzone_idx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) @@ -285,6 +292,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, ), TP_fast_assign( + __entry->classzone_idx = classzone_idx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; @@ -293,8 +301,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->file = file; ), - TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", + TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", __entry->isolate_mode, + __entry->classzone_idx, __entry->order, __entry->nr_requested, __entry->nr_scanned, @@ -304,27 +313,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, - TP_PROTO(int order, + TP_PROTO(int classzone_idx, + int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_taken, isolate_mode_t isolate_mode, int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) ); DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, - TP_PROTO(int order, + TP_PROTO(int classzone_idx, + int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_taken, isolate_mode_t isolate_mode, int file), - TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) ); @@ -352,15 +363,14 @@ TRACE_EVENT(mm_vmscan_writepage, TRACE_EVENT(mm_vmscan_lru_shrink_inactive, - TP_PROTO(struct zone *zone, + TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, int priority, int file), - TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file), TP_STRUCT__entry( __field(int, nid) - __field(int, zid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(int, priority) @@ -368,16 +378,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, ), TP_fast_assign( - __entry->nid = zone_to_nid(zone); - __entry->zid = zone_idx(zone); + __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", - __entry->nid, __entry->zid, + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 531f5811ff6b..2ccd9ccbf9ef 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -412,11 +412,11 @@ TRACE_EVENT(global_dirty_state, ), TP_fast_assign( - __entry->nr_dirty = global_page_state(NR_FILE_DIRTY); - __entry->nr_writeback = global_page_state(NR_WRITEBACK); - __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS); - __entry->nr_dirtied = global_page_state(NR_DIRTIED); - __entry->nr_written = global_page_state(NR_WRITTEN); + __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); + __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); + __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS); + __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); + __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 73e93e53884d..c7fd2778ed50 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return; - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return; - task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing diff --git a/kernel/fork.c b/kernel/fork.c index de21f25e0d2c..52e725d4a866 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - if (page) - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - 1 << THREAD_SIZE_ORDER); - return page ? page_address(page) : NULL; } static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(stack); - - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - -(1 << THREAD_SIZE_ORDER)); - __free_pages(page, THREAD_SIZE_ORDER); + __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(stack)); + /* All stack pages are in the same zone and belong to the same memcg. */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - mod_zone_page_state(zone, NR_KERNEL_STACK, account); + memcg_kmem_update_page_stat( + first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } void free_task(struct task_struct *tsk) diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_thread_flag(TIF_MEMDIE)) + if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/memremap.c b/kernel/memremap.c index 017532193fb1..ddb3247a872a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { - dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", - __func__); - return ERR_PTR(-ENXIO); - } - if (!ref) return ERR_PTR(-EINVAL); @@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) { /* @@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d90df926b59f..9a0178c2ac1d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable) unsigned long size; size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_FILE_MAPPED); return saveable <= size ? 0 : saveable - size; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60cdf6386763..d4de33934dac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35f0dcb1cb4f..53954631a4e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 67d8c6838ba9..bd38aab05929 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN config KASAN bool "KASan: runtime memory debugger" - depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB) + depends on SLUB || (SLAB && !DEBUG_SLAB) select CONSTRUCTORS - select STACKDEPOT if SLAB + select STACKDEPOT help Enables kernel address sanitizer - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d67c8288d95d..9e8c7386b3a0 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -144,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_writeable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { kaddr = kmap_atomic(page); from = kaddr + offset; @@ -175,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); from = kaddr + offset; left = __copy_to_user(buf, from, copy); @@ -193,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; @@ -225,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t buf = iov->iov_base + skip; copy = min(bytes, iov->iov_len - skip); - if (!fault_in_pages_readable(buf, copy)) { + if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { kaddr = kmap_atomic(page); to = kaddr + offset; @@ -256,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t copy = min(bytes, iov->iov_len - skip); } /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); to = kaddr + offset; left = __copy_from_user(to, buf, copy); @@ -274,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes -= copy; } kunmap(page); + done: if (skip == iov->iov_len) { iov++; diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 53ad6c0831ae..60f77f1d470a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace, */ alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); + alloc_flags |= __GFP_NOWARN; page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); if (page) prealloc = page_address(page); diff --git a/mm/Kconfig b/mm/Kconfig index 3c81803b00a3..c0837845c17c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -681,7 +681,7 @@ config IDLE_PAGE_TRACKING See Documentation/vm/idle_page_tracking.txt for more details. config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" if EXPERT + bool "Device memory (pmem, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ed173b8ae8f2..efe237742074 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout) EXPORT_SYMBOL(congestion_wait); /** - * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes - * @zone: A zone to check if it is heavily congested + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes + * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * In the event of a congested backing_dev (any backing_dev) and the given - * @zone has experienced recent congestion, this waits for up to @timeout + * @pgdat has experienced recent congestion, this waits for up to @timeout * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, cond_resched() is called to yield + * In the absence of pgdat congestion, cond_resched() is called to yield * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. */ -long wait_iff_congested(struct zone *zone, int sync, long timeout) +long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) /* * If there is no congestion, or heavy congestion is not being - * encountered in the current zone, yield if necessary instead + * encountered in the current pgdat, yield if necessary instead * of sleeping on the congestion queue */ if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(ZONE_CONGESTED, &zone->flags)) { + !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { cond_resched(); + /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); if (ret < 0) diff --git a/mm/compaction.c b/mm/compaction.c index 64df5fe052db..9affb2908304 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -331,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, { if (cc->mode == MIGRATE_ASYNC) { if (!spin_trylock_irqsave(lock, *flags)) { - cc->contended = COMPACT_CONTENDED_LOCK; + cc->contended = true; return false; } } else { @@ -365,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock, } if (fatal_signal_pending(current)) { - cc->contended = COMPACT_CONTENDED_SCHED; + cc->contended = true; return true; } if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = COMPACT_CONTENDED_SCHED; + cc->contended = true; return true; } cond_resched(); @@ -394,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc) /* async compaction aborts if contended */ if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = COMPACT_CONTENDED_SCHED; + cc->contended = true; return true; } @@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ @@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone) { unsigned long active, inactive, isolated; - inactive = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_ANON); - active = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_FILE) + - zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); + active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + + node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + + node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); return isolated > (inactive + active) / 2; } @@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * if contended. */ if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&zone->lru_lock, flags, + && compact_unlock_should_abort(zone_lru_lock(zone), flags, &locked, cc)) break; @@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(&zone->lru_lock, + spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } @@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* If we already hold the lock, we can skip some rechecking */ if (!locked) { - locked = compact_trylock_irqsave(&zone->lru_lock, + locked = compact_trylock_irqsave(zone_lru_lock(zone), &flags, cc); if (!locked) break; @@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) @@ -899,7 +899,7 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } acct_isolated(zone, cc); @@ -927,7 +927,7 @@ isolate_fail: low_pfn = end_pfn; if (locked) - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(zone), flags); /* * Update the pageblock-skip information and cached scanner pfn, @@ -1200,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | - (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); + (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); /* * Start at where we last stopped, or beginning of the zone as @@ -1619,14 +1619,11 @@ out: trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); - if (ret == COMPACT_CONTENDED) - ret = COMPACT_PARTIAL; - return ret; } static enum compact_result compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, int *contended, + gfp_t gfp_mask, enum compact_priority prio, unsigned int alloc_flags, int classzone_idx) { enum compact_result ret; @@ -1636,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .order = order, .gfp_mask = gfp_mask, .zone = zone, - .mode = mode, + .mode = (prio == COMPACT_PRIO_ASYNC) ? + MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, .direct_compaction = true, @@ -1649,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - *contended = cc.contended; return ret; } @@ -1662,50 +1659,38 @@ int sysctl_extfrag_threshold = 500; * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration - * @contended: Return value that determines if compaction was aborted due to - * need_resched() or lock contention * * This is the main entry point for direct page compaction. */ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + enum compact_priority prio) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ - - *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ - if (!order || !may_enter_fs || !may_perform_io) + if (!may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; - trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { enum compact_result status; - int zone_contended; if (compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } - status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, - ac_classzone_idx(ac)); + status = compact_zone_order(zone, order, gfp_mask, prio, + alloc_flags, ac_classzone_idx(ac)); rc = max(status, rc); - /* - * It takes at least one zone that wasn't lock contended - * to clear all_zones_contended. - */ - all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), @@ -1717,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, * succeeds in this zone. */ compaction_defer_reset(zone, order, false); - /* - * It is possible that async compaction aborted due to - * need_resched() and the watermarks were ok thanks to - * somebody else freeing memory. The allocation can - * however still fail so we better signal the - * need_resched() contention anyway (this will not - * prevent the allocation attempt). - */ - if (zone_contended == COMPACT_CONTENDED_SCHED) - *contended = COMPACT_CONTENDED_SCHED; - goto break_loop; + break; } - if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || - status == COMPACT_PARTIAL_SKIPPED)) { + if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up * succeeding after all, it will be reset. */ defer_compaction(zone, order); - } /* * We might have stopped compacting due to need_resched() in * async compaction, or due to a fatal signal detected. In that - * case do not try further zones and signal need_resched() - * contention. - */ - if ((zone_contended == COMPACT_CONTENDED_SCHED) - || fatal_signal_pending(current)) { - *contended = COMPACT_CONTENDED_SCHED; - goto break_loop; - } - - continue; -break_loop: - /* - * We might not have tried all the zones, so be conservative - * and assume they are not all lock contended. + * case do not try further zones */ - all_zones_contended = 0; - break; + if ((prio == COMPACT_PRIO_ASYNC && need_resched()) + || fatal_signal_pending(current)) + break; } - /* - * If at least one zone wasn't deferred or skipped, we report if all - * zones that were tried were lock contended. - */ - if (rc > COMPACT_INACTIVE && all_zones_contended) - *contended = COMPACT_CONTENDED_LOCK; - return rc; } diff --git a/mm/filemap.c b/mm/filemap.c index e90c1543ec2d..c5f5e46c6f7f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -95,8 +95,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) - * ->zone.lru_lock (follow_page->mark_page_accessed) - * ->zone.lru_lock (check_pte_range->isolate_lru_page) + * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) + * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) @@ -218,11 +218,11 @@ void __delete_from_page_cache(struct page *page, void *shadow) /* hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(page)) - __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { - __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr); + __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_zone_page_state(page, NR_SHMEM_THPS); + __dec_node_page_state(page, NR_SHMEM_THPS); } else { VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); } @@ -568,9 +568,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) * hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(new)) - __inc_zone_page_state(new, NR_FILE_PAGES); + __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) - __inc_zone_page_state(new, NR_SHMEM); + __inc_node_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); @@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. */ if (!huge) - __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_node_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3647334c2ef9..2373f0a7d340 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -539,23 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, } /* - * If THP is set to always then directly reclaim/compact as necessary - * If set to defer then do no reclaim and defer to khugepaged + * If THP defrag is set to always then directly reclaim/compact as necessary + * If set to defer then do only background reclaim/compact and defer to khugepaged * If set to madvise and the VMA is flagged then directly reclaim/compact + * When direct reclaim/compact is allowed, don't retry except for flagged VMA's */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - gfp_t reclaim_flags = 0; - - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && - (vma->vm_flags & VM_HUGEPAGE)) - reclaim_flags = __GFP_DIRECT_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - reclaim_flags = __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - reclaim_flags = __GFP_DIRECT_RECLAIM; - - return GFP_TRANSHUGE | reclaim_flags; + bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + &transparent_hugepage_flags) && vma_madvised) + return GFP_TRANSHUGE; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + + return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ @@ -1249,25 +1252,26 @@ out: return 0; } -int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, +/* + * Return true if we do MADV_FREE successfully on entire pmd page. + * Otherwise, return false. + */ +bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next) - { spinlock_t *ptl; pmd_t orig_pmd; struct page *page; struct mm_struct *mm = tlb->mm; - int ret = 0; + bool ret = false; ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; orig_pmd = *pmd; - if (is_huge_zero_pmd(orig_pmd)) { - ret = 1; + if (is_huge_zero_pmd(orig_pmd)) goto out; - } page = pmd_page(orig_pmd); /* @@ -1309,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } - ret = 1; + ret = true; out: spin_unlock(ptl); out_unlocked: @@ -1586,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_zone_page_state(page, NR_ANON_THPS); + __dec_node_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) @@ -1818,7 +1822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end = -1; int i; - lruvec = mem_cgroup_page_lruvec(head, zone); + lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); @@ -1848,7 +1852,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, spin_unlock(&head->mapping->tree_lock); } - spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); @@ -2034,7 +2038,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(&page_zone(head)->lru_lock, flags); + spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); if (mapping) { void **pslot; @@ -2061,7 +2065,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) list_del(page_deferred_list(head)); } if (mapping) - __dec_zone_page_state(page, NR_SHMEM_THPS); + __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); __split_huge_page(page, list, flags); ret = 0; @@ -2077,7 +2081,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) spin_unlock(&mapping->tree_lock); - spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51a04e5e9373..f904246a8fd5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4391,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, /* * This function is called from memory failure code. - * Assume the caller holds page lock of the head page. */ int dequeue_hwpoisoned_huge_page(struct page *hpage) { diff --git a/mm/internal.h b/mm/internal.h index 9b6a6c43ac39..1501304f87a4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool zone_reclaimable(struct zone *zone); +extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: @@ -185,10 +185,7 @@ struct compact_control { const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; - int contended; /* Signal need_sched() or lock - * contention detected during - * compaction - */ + bool contended; /* Signal lock or sched contention */ }; unsigned long @@ -433,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define ZONE_RECLAIM_NOSCAN -2 -#define ZONE_RECLAIM_FULL -1 -#define ZONE_RECLAIM_SOME 0 -#define ZONE_RECLAIM_SUCCESS 1 +#define NODE_RECLAIM_NOSCAN -2 +#define NODE_RECLAIM_FULL -1 +#define NODE_RECLAIM_SOME 0 +#define NODE_RECLAIM_SUCCESS 1 extern int hwpoison_filter(struct page *p); @@ -467,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ -#define ALLOC_FAIR 0x100 /* fair zone allocation */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1548749a3d45..2976a9ee104f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o kasan_init.o -obj-$(CONFIG_SLAB) += quarantine.o +obj-y := kasan.o report.o kasan_init.o quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6845f9294696..b6f99e81bfeb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } -#ifdef CONFIG_SLAB /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. @@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) { int redzone_adjust; - /* Make sure the adjusted size is still less than - * KMALLOC_MAX_CACHE_SIZE. - * TODO: this check is only useful for SLAB, but not SLUB. We'll need - * to skip it for SLUB when it starts using kasan_cache_create(). - */ - if (*size > KMALLOC_MAX_CACHE_SIZE - - sizeof(struct kasan_alloc_meta) - - sizeof(struct kasan_free_meta)) - return; - *flags |= SLAB_KASAN; + int orig_size = *size; + /* Add alloc meta. */ cache->kasan_info.alloc_meta_offset = *size; *size += sizeof(struct kasan_alloc_meta); @@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } redzone_adjust = optimal_redzone(cache->object_size) - (*size - cache->object_size); + if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_CACHE_SIZE, - max(*size, - cache->object_size + - optimal_redzone(cache->object_size))); + + *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + + optimal_redzone(cache->object_size))); + + /* + * If the metadata doesn't fit, don't enable KASAN at all. + */ + if (*size <= cache->kasan_info.alloc_meta_offset || + *size <= cache->kasan_info.free_meta_offset) { + cache->kasan_info.alloc_meta_offset = 0; + cache->kasan_info.free_meta_offset = 0; + *size = orig_size; + return; + } + + *flags |= SLAB_KASAN; } -#endif void kasan_cache_shrink(struct kmem_cache *cache) { @@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache) quarantine_remove_cache(cache); } +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + (cache->kasan_info.free_meta_offset ? + sizeof(struct kasan_free_meta) : 0); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); -#ifdef CONFIG_SLAB if (cache->flags & SLAB_KASAN) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); alloc_info->state = KASAN_STATE_INIT; } -#endif } -#ifdef CONFIG_SLAB static inline int in_irqentry_text(unsigned long ptr) { return (ptr >= (unsigned long)&__irqentry_text_start && @@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); return (void *)object + cache->kasan_info.free_meta_offset; } -#endif void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) { @@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) bool kasan_slab_free(struct kmem_cache *cache, void *object) { -#ifdef CONFIG_SLAB /* RCU slabs could be legally used after free within the RCU period */ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return false; if (likely(cache->flags & SLAB_KASAN)) { - struct kasan_alloc_meta *alloc_info = - get_alloc_info(cache, object); - struct kasan_free_meta *free_info = - get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info; + struct kasan_free_meta *free_info; + + alloc_info = get_alloc_info(cache, object); + free_info = get_free_info(cache, object); switch (alloc_info->state) { case KASAN_STATE_ALLOC: @@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) } } return false; -#else - kasan_poison_slab_free(cache, object); - return false; -#endif } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); -#ifdef CONFIG_SLAB if (cache->flags & SLAB_KASAN) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); @@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, alloc_info->alloc_size = size; set_track(&alloc_info->track, flags); } -#endif } EXPORT_SYMBOL(kasan_kmalloc); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fb87923552ef..31972cdba433 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object); - static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) @@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -#ifdef CONFIG_SLAB +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); void quarantine_reduce(void); void quarantine_remove_cache(struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b3c122ddd454..861b9776841a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -#ifdef CONFIG_SLAB static void print_track(struct kasan_track *track) { pr_err("PID = %u\n", track->pid); @@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track) } } -static void object_err(struct kmem_cache *cache, struct page *page, - void *object, char *unused_reason) +static void kasan_object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); struct kasan_free_meta *free_info; @@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page, break; } } -#endif static void print_address_description(struct kasan_access_info *info) { @@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info) struct kmem_cache *cache = page->slab_cache; object = nearest_obj(cache, page, (void *)info->access_addr); - object_err(cache, page, object, + kasan_object_err(cache, page, object, "kasan: bad access detected"); return; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7dbee698d6aa..79c52d0061af 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { /* 0 stands for page_is_file_cache(page) == false */ - dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + 0); unlock_page(page); putback_lru_page(page); } @@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } /* 0 stands for page_is_file_cache(page) == false */ - inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, NR_ISOLATED_ANON + 0); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid) int i; /* - * If zone_reclaim_mode is disabled, then no extra effort is made to + * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ - if (!zone_reclaim_mode) + if (!node_reclaim_mode) return false; /* If there is a count for this node already, it must be acceptable */ @@ -694,7 +694,7 @@ static bool khugepaged_scan_abort(int nid) /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { - return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); + return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT; } #ifdef CONFIG_NUMA @@ -1483,10 +1483,10 @@ tree_unlocked: } local_irq_save(flags); - __inc_zone_page_state(new_page, NR_SHMEM_THPS); + __inc_node_page_state(new_page, NR_SHMEM_THPS); if (nr_none) { - __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); - __mod_zone_page_state(zone, NR_SHMEM, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } local_irq_restore(flags); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 04320d3adbef..086292f7c59d 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg) * Wait before the first scan to allow the system to fully initialize. */ if (first_run) { + signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000); first_run = 0; - ssleep(SECS_FIRST_SCAN); + while (timeout && !kthread_should_stop()) + timeout = schedule_timeout_interruptible(timeout); } while (!kthread_should_stop()) { diff --git a/mm/memblock.c b/mm/memblock.c index ca099159b45a..ff5ff3b5f1ea 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,7 +20,7 @@ #include <linux/seq_file.h> #include <linux/memblock.h> -#include <asm-generic/sections.h> +#include <asm/sections.h> #include <linux/io.h> #include "internal.h" @@ -1027,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, *out_end = m_end; if (out_nid) *out_nid = m_nid; - idx_a++; + idx_a--; *idx = (u32)idx_a | (u64)idx_b << 32; return; } @@ -1465,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } -void __init memblock_enforce_memory_limit(phys_addr_t limit) +static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; struct memblock_region *r; - if (!limit) - return; - - /* find out max address */ + /* + * translate the memory @limit size into the max address within one of + * the memory memblock regions, if the @limit exceeds the total size + * of those regions, max_addr will keep original value ULLONG_MAX + */ for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; @@ -1482,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) limit -= r->size; } + return max_addr; +} + +void __init memblock_enforce_memory_limit(phys_addr_t limit) +{ + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + + if (!limit) + return; + + max_addr = __find_max_addr(limit); + + /* @limit exceeds the total size of the memory, do nothing */ + if (max_addr == (phys_addr_t)ULLONG_MAX) + return; + /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); @@ -1489,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) (phys_addr_t)ULLONG_MAX); } +void __init memblock_mem_limit_remove_map(phys_addr_t limit) +{ + struct memblock_type *type = &memblock.memory; + phys_addr_t max_addr; + int i, ret, start_rgn, end_rgn; + + if (!limit) + return; + + max_addr = __find_max_addr(limit); + + /* @limit exceeds the total size of the memory, do nothing */ + if (max_addr == (phys_addr_t)ULLONG_MAX) + return; + + ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX, + &start_rgn, &end_rgn); + if (ret) + return; + + /* remove all the MAP regions above the limit */ + for (i = end_rgn - 1; i >= start_rgn; i--) { + if (!memblock_is_nomap(&type->regions[i])) + memblock_remove_region(type, i); + } + /* truncate the reserved regions */ + memblock_remove_range(&memblock.reserved, max_addr, + (phys_addr_t)ULLONG_MAX); +} + static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) { unsigned int left = 0, right = type->cnt; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3a84c64f35c..c265212bec8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = { * their hierarchy representation */ -struct mem_cgroup_tree_per_zone { +struct mem_cgroup_tree_per_node { struct rb_root rb_root; spinlock_t lock; }; -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - struct mem_cgroup_tree { struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; }; @@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* !CONFIG_SLOB */ -static struct mem_cgroup_per_zone * -mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - - return &memcg->nodeinfo[nid]->zoneinfo[zid]; -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page) return ino; } -static struct mem_cgroup_per_zone * -mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) +static struct mem_cgroup_per_node * +mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); - int zid = page_zonenum(page); - return &memcg->nodeinfo[nid]->zoneinfo[zid]; + return memcg->nodeinfo[nid]; } -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) +static struct mem_cgroup_tree_per_node * +soft_limit_tree_node(int nid) { - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; + return soft_limit_tree.rb_tree_per_node[nid]; } -static struct mem_cgroup_tree_per_zone * +static struct mem_cgroup_tree_per_node * soft_limit_tree_from_page(struct page *page) { int nid = page_to_nid(page); - int zid = page_zonenum(page); - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; + return soft_limit_tree.rb_tree_per_node[nid]; } -static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; + struct mem_cgroup_per_node *mz_node; if (mz->on_tree) return; @@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, return; while (*p) { parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + mz_node = rb_entry(parent, struct mem_cgroup_per_node, tree_node); if (mz->usage_in_excess < mz_node->usage_in_excess) p = &(*p)->rb_left; @@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, mz->on_tree = true; } -static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) { if (!mz->on_tree) return; @@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, mz->on_tree = false; } -static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) { unsigned long flags; @@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long excess; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); /* @@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_page_zoneinfo(memcg, page); + mz = mem_cgroup_page_nodeinfo(memcg, page); excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or @@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - struct mem_cgroup_tree_per_zone *mctz; - struct mem_cgroup_per_zone *mz; - int nid, zid; + struct mem_cgroup_tree_per_node *mctz; + struct mem_cgroup_per_node *mz; + int nid; for_each_node(nid) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - mctz = soft_limit_tree_node_zone(nid, zid); - mem_cgroup_remove_exceeded(mz, mctz); - } + mz = mem_cgroup_nodeinfo(memcg, nid); + mctz = soft_limit_tree_node(nid); + mem_cgroup_remove_exceeded(mz, mctz); } } -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +static struct mem_cgroup_per_node * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; retry: mz = NULL; @@ -532,7 +515,7 @@ retry: if (!rightmost) goto done; /* Nothing to reclaim from */ - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); /* * Remove the node now but someone else can add it back, * we will to add it back at the end of reclaim to its correct @@ -546,10 +529,10 @@ done: return mz; } -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +static struct mem_cgroup_per_node * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); @@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { unsigned long nr = 0; - int zid; + struct mem_cgroup_per_node *mz; + enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - nr += mz->lru_size[lru]; - } + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = mem_cgroup_nodeinfo(memcg, nid); + nr += mz->lru_size[lru]; } return nr; } @@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); if (reclaim) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); iter = &mz->iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) @@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) { struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; - struct mem_cgroup_per_zone *mz; - int nid, zid; + struct mem_cgroup_per_node *mz; + int nid; int i; while ((memcg = parent_mem_cgroup(memcg))) { for_each_node(nid) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + mz = mem_cgroup_nodeinfo(memcg, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } @@ -944,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg - * @zone: zone of the wanted lruvec - * @memcg: memcg of the wanted lruvec - * - * Returns the lru list vector holding pages for the given @zone and - * @mem. This can be the global zone lruvec, if the memory controller - * is disabled. - */ -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_zone *mz; - struct lruvec *lruvec; - - if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; - goto out; - } - - mz = mem_cgroup_zone_zoneinfo(memcg, zone); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to reinitialize it. - */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; - return lruvec; -} - -/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -985,14 +929,14 @@ out: * and putback protocol: the LRU lock must be held, and the page must * either be PageLRU() or the caller must have isolated/allocated it. */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) +struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; struct mem_cgroup *memcg; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; + lruvec = &pgdat->lruvec; goto out; } @@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!memcg) memcg = root_mem_cgroup; - mz = mem_cgroup_page_zoneinfo(memcg, page); + mz = mem_cgroup_page_nodeinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1012,8 +956,8 @@ out: * we have to be prepared to initialize lruvec->zone here; * and if offlined then reonlined, we need to reinitialize it. */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; + if (unlikely(lruvec->pgdat != pgdat)) + lruvec->pgdat = pgdat; return lruvec; } @@ -1030,17 +974,15 @@ out: void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; bool empty; - __update_lru_size(lruvec, lru, nr_pages); - if (mem_cgroup_disabled()) return; - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = mz->lru_size + lru; empty = list_empty(lruvec->lists + lru); @@ -1276,9 +1218,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || task_will_free_mem(current)) { + if (task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); goto unlock; } @@ -1433,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, + pg_data_t *pgdat, gfp_t gfp_mask, unsigned long *total_scanned) { @@ -1443,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, unsigned long excess; unsigned long nr_scanned; struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, + .pgdat = pgdat, .priority = 0, }; @@ -1473,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, } continue; } - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, - zone, &nr_scanned); + total += mem_cgroup_shrink_node(victim, gfp_mask, false, + pgdat, &nr_scanned); *total_scanned += nr_scanned; if (!soft_limit_excess(root_memcg)) break; @@ -2107,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated) { struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2126,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated) if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); } static void commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -2431,7 +2373,7 @@ void memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock and migration entries setup in all page mappings. + * zone_lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -2601,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; - struct mem_cgroup_per_zone *mz, *next_mz = NULL; + struct mem_cgroup_per_node *mz, *next_mz = NULL; unsigned long reclaimed; int loop = 0; - struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_tree_per_node *mctz; unsigned long excess; unsigned long nr_scanned; if (order > 0) return 0; - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + mctz = soft_limit_tree_node(pgdat->node_id); /* * This loop can run a while, specially if mem_cgroup's continuously * keep exceeding their soft limit and putting the system under @@ -2631,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; @@ -3252,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_VM { - int nid, zid; - struct mem_cgroup_per_zone *mz; + pg_data_t *pgdat; + struct mem_cgroup_per_node *mz; struct zone_reclaim_stat *rstat; unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0}; - for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - rstat = &mz->lruvec.reclaim_stat; + for_each_online_pgdat(pgdat) { + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; - } + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; + } seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); @@ -4147,11 +4088,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return idr_find(&mem_cgroup_idr, id); } -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - struct mem_cgroup_per_zone *mz; - int zone, tmp = node; + int tmp = node; /* * This routine is called against possible nodes. * But it's BUG to call kmalloc() against offline node. @@ -4166,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = &pn->zoneinfo[zone]; - lruvec_init(&mz->lruvec); - mz->usage_in_excess = 0; - mz->on_tree = false; - mz->memcg = memcg; - } + lruvec_init(&pn->lruvec); + pn->usage_in_excess = 0; + pn->on_tree = false; + pn->memcg = memcg; + memcg->nodeinfo[node] = pn; return 0; } -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { kfree(memcg->nodeinfo[node]); } @@ -4188,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) memcg_wb_domain_exit(memcg); for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); + free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->stat); kfree(memcg); } @@ -4217,7 +4155,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; for_each_node(node) - if (alloc_mem_cgroup_per_zone_info(memcg, node)) + if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; if (memcg_wb_domain_init(memcg, GFP_KERNEL)) @@ -5233,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); + (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); @@ -5820,18 +5758,12 @@ static int __init mem_cgroup_init(void) for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; - int zone; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - struct mem_cgroup_tree_per_zone *rtpz; - - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } + rtpn->rb_root = RB_ROOT; + spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2fcca6b0e005..de88f33519c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) * page->lru because it can be used in other hugepage operations, * such as __unmap_hugepage_range() and gather_surplus_pages(). * So instead we use page_mapping() and PageAnon(). - * We assume that this function is called with page lock held, - * so there is no race between isolation and mapping/unmapping. */ if (!(page_mapping(hpage) || PageAnon(hpage))) { res = dequeue_hwpoisoned_huge_page(hpage); @@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, @@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags) if (ret) { if (!list_empty(&pagelist)) { list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 82d0b98d27f8..3894b65b1555 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1209,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) arch_refresh_nodedata(nid, pgdat); } else { - /* Reset the nr_zones and classzone_idx to 0 before reuse */ + /* Reset the nr_zones, order and classzone_idx before reuse */ pgdat->nr_zones = 0; - pgdat->classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ @@ -1547,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } +static struct page *new_node_page(struct page *page, unsigned long private, + int **result) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + int nid = page_to_nid(page); + nodemask_t nmask = node_online_map; + struct page *new_page; + + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * accordance with memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination. + */ + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node_in(nid, nmask)); + + node_clear(nid, nmask); + if (PageHighMem(page) + || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) + gfp_mask |= __GFP_HIGHMEM; + + new_page = __alloc_pages_nodemask(gfp_mask, 0, + node_zonelist(nid, gfp_mask), &nmask); + if (!new_page) + new_page = __alloc_pages(gfp_mask, 0, + node_zonelist(nid, gfp_mask)); + + return new_page; +} + #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -1586,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } else { @@ -1610,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } - /* - * alloc_migrate_target should be improooooved!! - * migrate_pages returns # of failed pages. - */ - ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, + /* Allocate a new page from the nearest neighbor node */ + ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 53e40d3f3933..d8c4e38fb5f4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } } diff --git a/mm/mempool.c b/mm/mempool.c index 8f65464da5de..47a659dedd44 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) - * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. + * Note: using __GFP_ZERO is not supported. */ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { @@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) wait_queue_t wait; gfp_t gfp_temp; - /* If oom killed, memory reserves are essential to prevent livelock */ - VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); - /* No element size to zero on allocation */ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); + gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: - if (likely(pool->curr_nr)) { - /* - * Don't allocate from emergency reserves if there are - * elements available. This check is racy, but it will - * be rechecked each loop. - */ - gfp_temp |= __GFP_NOMEMALLOC; - } element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) @@ -359,12 +348,11 @@ repeat_alloc: * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ - if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { + if (gfp_temp != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } - gfp_temp = gfp_mask; /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { diff --git a/mm/migrate.c b/mm/migrate.c index 2232f6923cc7..f7ee04a5ae27 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use @@ -501,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping, * new page and drop references to the old page. * * Note that anonymous pages are accounted for - * via NR_FILE_PAGES and NR_ANON_PAGES if they + * via NR_FILE_PAGES and NR_ANON_MAPPED if they * are mapped to swap space. */ if (newzone != oldzone) { - __dec_zone_state(oldzone, NR_FILE_PAGES); - __inc_zone_state(newzone, NR_FILE_PAGES); + __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); + __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_zone_state(oldzone, NR_SHMEM); - __inc_zone_state(newzone, NR_SHMEM); + __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); + __inc_node_state(newzone->zone_pgdat, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { - __dec_zone_state(oldzone, NR_FILE_DIRTY); - __inc_zone_state(newzone, NR_FILE_DIRTY); + __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); + __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); + __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); + __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); } } local_irq_enable(); @@ -1119,7 +1121,7 @@ out: * restored. */ list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } @@ -1460,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); - inc_zone_page_state(page, NR_ISOLATED_ANON + + inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } put_and_set: @@ -1726,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, unsigned long nr_migrate_pages) { int z; + + if (!pgdat_reclaimable(pgdat)) + return false; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; if (!populated_zone(zone)) continue; - if (!zone_reclaimable(zone)) - continue; - /* Avoid waking kswapd by allocating pages_to_migrate pages. */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + @@ -1828,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) } page_lru = page_is_file_cache(page); - mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, hpage_nr_pages(page)); /* @@ -1886,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); - dec_zone_page_state(page, NR_ISOLATED_ANON + + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } @@ -1931,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, + (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); if (!new_page) goto out_fail; @@ -1979,7 +1982,7 @@ fail_putback: /* Retake the callers reference and putback on LRU */ get_page(page); putback_lru_page(page); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); goto out_unlock; @@ -2030,7 +2033,7 @@ fail_putback: count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); return isolated; diff --git a/mm/mlock.c b/mm/mlock.c index ef8dc9f395c4..14645be06e30 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (getpage) get_page(page); ClearPageLRU(page); @@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page) * might otherwise copy PageMlocked to part of the tail pages before * we clear it in the head page. It also stabilizes hpage_nr_pages(). */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); nr_pages = hpage_nr_pages(page); if (!TestClearPageMlocked(page)) @@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page) __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); if (__munlock_isolate_lru_page(page, true)) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); __munlock_isolated_page(page); goto out; } __munlock_isolation_failed(page); unlock_out: - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); out: return nr_pages - 1; @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_init(&pvec_putback, 0); /* Phase 1: page isolation */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) } delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mmap.c b/mm/mmap.c index 86b18f334f4f..d44bee96a5fe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -621,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next; - struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; @@ -631,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; if (next && !insert) { - struct vm_area_struct *exporter = NULL; + struct vm_area_struct *exporter = NULL, *importer = NULL; if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). */ -again: remove_next = 1 + (end > next->vm_end); + remove_next = 1 + (end > next->vm_end); end = next->vm_end; exporter = next; importer = vma; + + /* + * If next doesn't have anon_vma, import from vma after + * next, if the vma overlaps with it. + */ + if (remove_next == 2 && next && !next->anon_vma) + exporter = next->vm_next; + } else if (end > next->vm_start) { /* * vma expands, overlapping part of the next: @@ -675,7 +682,7 @@ again: remove_next = 1 + (end > next->vm_end); return error; } } - +again: vma_adjust_trans_huge(vma, start, end, adjust_next); if (file) { @@ -796,8 +803,11 @@ again: remove_next = 1 + (end > next->vm_end); * up the code too much to do both in one go. */ next = vma->vm_next; - if (remove_next == 2) + if (remove_next == 2) { + remove_next = 1; + end = next->vm_end; goto again; + } else if (next) vma_gap_update(next); else diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d4a929d79470..7d0a275df822 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, /* * Do not even consider tasks which are explicitly marked oom - * unkillable or have been already oom reaped. + * unkillable or have been already oom reaped or the are in + * the middle of vfork */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags)) { + test_bit(MMF_OOM_REAPED, &p->mm->flags) || + in_vfork(p)) { task_unlock(p); return 0; } @@ -281,10 +283,22 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, /* * This task already has access to memory reserves and is being killed. - * Don't allow any other task to have access to the reserves. + * Don't allow any other task to have access to the reserves unless + * the task has MMF_OOM_REAPED because chances that it would release + * any memory is quite low. */ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) - return OOM_SCAN_ABORT; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { + struct task_struct *p = find_lock_task_mm(task); + enum oom_scan_t ret = OOM_SCAN_ABORT; + + if (p) { + if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) + ret = OOM_SCAN_CONTINUE; + task_unlock(p); + } + + return ret; + } /* * If task is allocating a lot of memory and has been marked to be @@ -415,7 +429,7 @@ bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) { struct task_struct *t; @@ -554,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts > MAX_OOM_REAP_RETRIES) { + struct task_struct *p; + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); + + /* + * If we've already tried to reap this task in the past and + * failed it probably doesn't make much sense to try yet again + * so hide the mm from the oom killer so that it can move on + * to another task with a different mm struct. + */ + p = find_lock_task_mm(tsk); + if (p) { + if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { + pr_info("oom_reaper: giving up pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + set_bit(MMF_OOM_REAPED, &p->mm->flags); + } + task_unlock(p); + } + debug_show_all_locks(); } @@ -594,7 +627,7 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +void wake_oom_reaper(struct task_struct *tsk) { if (!oom_reaper_th) return; @@ -612,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } -/* Check if we can reap the given task. This has to be called with stable - * tsk->mm - */ -void try_oom_reaper(struct task_struct *tsk) -{ - struct mm_struct *mm = tsk->mm; - struct task_struct *p; - - if (!mm) - return; - - /* - * There might be other threads/processes which are either not - * dying or even not killable. - */ - if (atomic_read(&mm->mm_users) > 1) { - rcu_read_lock(); - for_each_process(p) { - if (!process_shares_mm(p, mm)) - continue; - if (fatal_signal_pending(p)) - continue; - - /* - * If the task is exiting make sure the whole thread group - * is exiting and cannot acces mm anymore. - */ - if (signal_group_exit(p->signal)) - continue; - - /* Give up */ - rcu_read_unlock(); - return; - } - rcu_read_unlock(); - } - - wake_oom_reaper(tsk); -} - static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -663,10 +656,6 @@ static int __init oom_init(void) return 0; } subsys_initcall(oom_init) -#else -static void wake_oom_reaper(struct task_struct *tsk) -{ -} #endif /** @@ -743,6 +732,80 @@ void oom_killer_enable(void) oom_killer_disabled = false; } +static inline bool __task_will_free_mem(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + + /* + * A coredumping process may sleep for an extended period in exit_mm(), + * so the oom killer cannot assume that the process will promptly exit + * and release memory. + */ + if (sig->flags & SIGNAL_GROUP_COREDUMP) + return false; + + if (sig->flags & SIGNAL_GROUP_EXIT) + return true; + + if (thread_group_empty(task) && (task->flags & PF_EXITING)) + return true; + + return false; +} + +/* + * Checks whether the given task is dying or exiting and likely to + * release its address space. This means that all threads and processes + * sharing the same mm have to be killed or exiting. + * Caller has to make sure that task->mm is stable (hold task_lock or + * it operates on the current). + */ +bool task_will_free_mem(struct task_struct *task) +{ + struct mm_struct *mm = task->mm; + struct task_struct *p; + bool ret; + + /* + * Skip tasks without mm because it might have passed its exit_mm and + * exit_oom_victim. oom_reaper could have rescued that but do not rely + * on that for now. We can consider find_lock_task_mm in future. + */ + if (!mm) + return false; + + if (!__task_will_free_mem(task)) + return false; + + /* + * This task has already been drained by the oom reaper so there are + * only small chances it will free some more + */ + if (test_bit(MMF_OOM_REAPED, &mm->flags)) + return false; + + if (atomic_read(&mm->mm_users) <= 1) + return true; + + /* + * This is really pessimistic but we do not have any reliable way + * to check that external processes share with our mm + */ + rcu_read_lock(); + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(task, p)) + continue; + ret = __task_will_free_mem(p); + if (!ret) + break; + } + rcu_read_unlock(); + + return ret; +} + /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -765,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * its children or threads, just set TIF_MEMDIE so it can die quickly */ task_lock(p); - if (p->mm && task_will_free_mem(p)) { + if (task_will_free_mem(p)) { mark_oom_victim(p); - try_oom_reaper(p); + wake_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -850,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || - p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { /* * We cannot use oom_reaper for the mm shared by this * process because it wouldn't get killed and so the - * memory might be still used. + * memory might be still used. Hide the mm from the oom + * killer to guarantee OOM forward progress. */ can_oom_reap = false; + set_bit(MMF_OOM_REAPED, &mm->flags); + pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", + task_pid_nr(victim), victim->comm, + task_pid_nr(p), p->comm); continue; } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); @@ -939,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc) * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. - * - * But don't select if current has already released its mm and cleared - * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. */ - if (current->mm && - (fatal_signal_pending(current) || task_will_free_mem(current))) { + if (task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); return true; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d578d2a56b19..f4cd7d8005c9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, */ /** - * zone_dirtyable_memory - number of dirtyable pages in a zone - * @zone: the zone + * node_dirtyable_memory - number of dirtyable pages in a node + * @pgdat: the node * - * Returns the zone's number of pages potentially available for dirty - * page cache. This is the base value for the per-zone dirty limits. + * Returns the node's number of pages potentially available for dirty + * page cache. This is the base value for the per-node dirty limits. */ -static unsigned long zone_dirtyable_memory(struct zone *zone) +static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) { - unsigned long nr_pages; + unsigned long nr_pages = 0; + int z; + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + nr_pages += zone_page_state(zone, NR_FREE_PAGES); + } - nr_pages = zone_page_state(zone, NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to * clean pages in order to balance the zones. */ - nr_pages -= min(nr_pages, zone->totalreserve_pages); + nr_pages -= min(nr_pages, pgdat->totalreserve_pages); - nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); - nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); + nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); return nr_pages; } @@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) int i; for_each_node_state(node, N_HIGH_MEMORY) { - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *z = &NODE_DATA(node)->node_zones[i]; + for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { + struct zone *z; + unsigned long nr_pages; + + if (!is_highmem_idx(i)) + continue; + + z = &NODE_DATA(node)->node_zones[i]; + if (!populated_zone(z)) + continue; - if (is_highmem(z)) - x += zone_dirtyable_memory(z); + nr_pages = zone_page_state(z, NR_FREE_PAGES); + /* watch for underflows */ + nr_pages -= min(nr_pages, high_wmark_pages(z)); + nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); + nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); + x += nr_pages; } } + /* * Unreclaimable memory (kernel memory or anonymous memory * without swap) can bring down the dirtyable pages below @@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void) */ x -= min(x, totalreserve_pages); - x += global_page_state(NR_INACTIVE_FILE); - x += global_page_state(NR_ACTIVE_FILE); + x += global_node_page_state(NR_INACTIVE_FILE); + x += global_node_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) } /** - * zone_dirty_limit - maximum number of dirty pages allowed in a zone - * @zone: the zone + * node_dirty_limit - maximum number of dirty pages allowed in a node + * @pgdat: the node * - * Returns the maximum number of dirty pages allowed in a zone, based - * on the zone's dirtyable memory. + * Returns the maximum number of dirty pages allowed in a node, based + * on the node's dirtyable memory. */ -static unsigned long zone_dirty_limit(struct zone *zone) +static unsigned long node_dirty_limit(struct pglist_data *pgdat) { - unsigned long zone_memory = zone_dirtyable_memory(zone); + unsigned long node_memory = node_dirtyable_memory(pgdat); struct task_struct *tsk = current; unsigned long dirty; if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * - zone_memory / global_dirtyable_memory(); + node_memory / global_dirtyable_memory(); else - dirty = vm_dirty_ratio * zone_memory / 100; + dirty = vm_dirty_ratio * node_memory / 100; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) dirty += dirty / 4; @@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone) } /** - * zone_dirty_ok - tells whether a zone is within its dirty limits - * @zone: the zone to check + * node_dirty_ok - tells whether a node is within its dirty limits + * @pgdat: the node to check * - * Returns %true when the dirty pages in @zone are within the zone's + * Returns %true when the dirty pages in @pgdat are within the node's * dirty limit, %false if the limit is exceeded. */ -bool zone_dirty_ok(struct zone *zone) +bool node_dirty_ok(struct pglist_data *pgdat) { - unsigned long limit = zone_dirty_limit(zone); + unsigned long limit = node_dirty_limit(pgdat); + unsigned long nr_pages = 0; + + nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); + nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); + nr_pages += node_page_state(pgdat, NR_WRITEBACK); - return zone_page_state(zone, NR_FILE_DIRTY) + - zone_page_state(zone, NR_UNSTABLE_NFS) + - zone_page_state(zone, NR_WRITEBACK) <= limit; + return nr_pages <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, @@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping, * written to the server's write cache, but has not yet * been flushed to permanent storage. */ - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS); gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); + gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); domain_dirty_limits(gdtc); @@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) @@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) */ dirty_thresh += dirty_thresh / 10; /* wheeee... */ - if (global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK) <= dirty_thresh) + if (global_node_page_state(NR_UNSTABLE_NFS) + + global_node_page_state(NR_WRITEBACK) <= dirty_thresh) break; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void laptop_mode_timer_fn(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - int nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + int nr_pages = global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS); struct bdi_writeback *wb; /* @@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) wb = inode_to_wb(inode); mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_zone_page_state(page, NR_DIRTIED); + __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); + __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); @@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, { if (mapping_cap_account_dirty(mapping)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); - dec_zone_page_state(page, NR_FILE_DIRTY); + dec_node_page_state(page, NR_FILE_DIRTY); + dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_SIZE); } @@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page) wb = unlocked_inode_to_wb_begin(inode, &locked); current->nr_dirtied--; - dec_zone_page_state(page, NR_DIRTIED); + dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); unlocked_inode_to_wb_end(inode, locked); } @@ -2713,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page) wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); - dec_zone_page_state(page, NR_FILE_DIRTY); + dec_node_page_state(page, NR_FILE_DIRTY); + dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } @@ -2759,8 +2787,9 @@ int test_clear_page_writeback(struct page *page) } if (ret) { mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - dec_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); + dec_node_page_state(page, NR_WRITEBACK); + dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); + inc_node_page_state(page, NR_WRITTEN); } unlock_page_memcg(page); return ret; @@ -2813,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } if (!ret) { mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); + inc_node_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 452513bf02ce..ea759b935360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - if (pfn >= NODE_DATA(nid)->first_deferred_pfn) - return true; - - return false; -} - /* * Returns false when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. @@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } -static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) -{ - return false; -} - static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) @@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone, { unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { @@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - /* Set the pageblock if the isolated page is at least a pageblock */ + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { @@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, else page = list_first_entry(list, struct page, lru); - __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; @@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && - !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) - set_bit(ZONE_FAIR_DEPLETED, &zone->flags); - - __count_zone_vm_events(PGALLOC, zone, 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, } #ifdef CONFIG_NUMA -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return local_zone->node == zone->node; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ -static bool zone_local(struct zone *local_zone, struct zone *zone) -{ - return true; -} - static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ -static void reset_alloc_batches(struct zone *preferred_zone) -{ - struct zone *zone = preferred_zone->zone_pgdat->node_zones; - - do { - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); - } while (zone++ != preferred_zone); -} - /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z = ac->preferred_zoneref; struct zone *zone; - bool fair_skipped = false; - bool apply_fair = (alloc_flags & ALLOC_FAIR); + struct pglist_data *last_pgdat_dirty_limit = NULL; -zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. @@ -2904,50 +2864,33 @@ zonelist_scan: !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* - * Distribute pages in proportion to the individual - * zone size to ensure fair page aging. The zone a - * page was allocated in should have no effect on the - * time the page has in memory before being reclaimed. - */ - if (apply_fair) { - if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - fair_skipped = true; - continue; - } - if (!zone_local(ac->preferred_zoneref->zone, zone)) { - if (fair_skipped) - goto reset_fair; - apply_fair = false; - } - } - /* * When allocating a page cache page for writing, we - * want to get it from a zone that is within its dirty - * limit, such that no single zone holds more than its + * want to get it from a node that is within its dirty + * limit, such that no single node holds more than its * proportional share of globally allowed dirty pages. - * The dirty limits take into account the zone's + * The dirty limits take into account the node's * lowmem reserves and high watermark so that kswapd * should be able to balance it without having to * write pages from its LRU list. * - * This may look like it could increase pressure on - * lower zones by failing allocations in higher zones - * before they are full. But the pages that do spill - * over are limited as the lower zones are protected - * by this very same mechanism. It should not become - * a practical burden to them. - * * XXX: For now, allow allocations to potentially - * exceed the per-zone dirty limit in the slowpath + * exceed the per-node dirty limit in the slowpath * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed - * zones are together not big enough to reach the + * nodes are together not big enough to reach the * global limit. The proper fix for these situations - * will require awareness of zones in the + * will require awareness of nodes in the * dirty-throttling and the flusher threads. */ - if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) - continue; + if (ac->spread_dirty_pages) { + if (last_pgdat_dirty_limit == zone->zone_pgdat) + continue; + + if (!node_dirty_ok(zone->zone_pgdat)) { + last_pgdat_dirty_limit = zone->zone_pgdat; + continue; + } + } mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_fast(zone, order, mark, @@ -2959,16 +2902,16 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (zone_reclaim_mode == 0 || + if (node_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; - ret = zone_reclaim(zone, gfp_mask, order); + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); switch (ret) { - case ZONE_RECLAIM_NOSCAN: + case NODE_RECLAIM_NOSCAN: /* did not scan */ continue; - case ZONE_RECLAIM_FULL: + case NODE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: @@ -2998,23 +2941,6 @@ try_this_zone: } } - /* - * The first pass makes sure allocations are spread fairly within the - * local node. However, the local node might have free pages left - * after the fairness batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without fairness, and - * include remote zones now, before entering the slowpath and waking - * kswapd: prefer spilling to a remote zone over swapping locally. - */ - if (fair_skipped) { -reset_fair: - apply_fair = false; - fair_skipped = false; - reset_alloc_batches(ac->preferred_zoneref->zone); - z = ac->preferred_zoneref; - goto zonelist_scan; - } - return NULL; } @@ -3159,7 +3085,6 @@ out: return page; } - /* * Maximum number of compaction retries wit a progress before OOM * killer is consider as the only way to move forward. @@ -3171,17 +3096,16 @@ out: static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { struct page *page; - int contended_compaction; if (!order) return NULL; current->flags |= PF_MEMALLOC; *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, &contended_compaction); + prio); current->flags &= ~PF_MEMALLOC; if (*compact_result <= COMPACT_INACTIVE) @@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) { struct zone *zone = page_zone(page); @@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); - /* - * In all zones where compaction was attempted (and not - * deferred or skipped), lock contention has been detected. - * For THP allocation we do not want to disrupt the others - * so we fallback to base pages instead. - */ - if (contended_compaction == COMPACT_CONTENDED_LOCK) - *compact_result = COMPACT_CONTENDED; - - /* - * If compaction was aborted due to need_resched(), we do not - * want to further increase allocation latency, unless it is - * khugepaged trying to collapse. - */ - if (contended_compaction == COMPACT_CONTENDED_SCHED - && !(current->flags & PF_KTHREAD)) - *compact_result = COMPACT_CONTENDED; - cond_resched(); return NULL; @@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, - enum compact_result compact_result, enum migrate_mode *migrate_mode, + enum compact_result compact_result, + enum compact_priority *compact_priority, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, /* * compaction considers all the zone as desperately out of memory * so it doesn't really make much sense to retry except when the - * failure could be caused by weak migration mode. + * failure could be caused by insufficient priority */ if (compaction_failed(compact_result)) { - if (*migrate_mode == MIGRATE_ASYNC) { - *migrate_mode = MIGRATE_SYNC_LIGHT; + if (*compact_priority > MIN_COMPACT_PRIORITY) { + (*compact_priority)--; return true; } return false; @@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, enum compact_result *compact_result) + enum compact_priority prio, enum compact_result *compact_result) { *compact_result = COMPACT_SKIPPED; return NULL; @@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, - enum migrate_mode *migrate_mode, + enum compact_priority *compact_priority, int compaction_retries) { struct zone *zone; @@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; retry: - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; + pg_data_t *last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, ac_classzone_idx(ac)); + ac->high_zoneidx, ac->nodemask) { + if (last_pgdat != zone->zone_pgdat) + wakeup_kswapd(zone, order, ac->high_zoneidx); + last_pgdat = zone->zone_pgdat; + } } static inline unsigned int @@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (gfp_mask & __GFP_MEMALLOC) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - alloc_flags |= ALLOC_NO_WATERMARKS; - else if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - alloc_flags |= ALLOC_NO_WATERMARKS; - } #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { - return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); -} + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return false; -static inline bool is_thp_gfp_mask(gfp_t gfp_mask) -{ - return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; + if (gfp_mask & __GFP_MEMALLOC) + return true; + if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + return true; + if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + return true; + + return false; } /* @@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; /* - * Keep reclaiming pages while there is a chance this will lead somewhere. - * If none of the target zones can satisfy our allocation request even - * if all reclaimable pages are considered then we are screwed and have - * to go OOM. + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { @@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * prevent from pre mature OOM */ if (!did_some_progress) { - unsigned long writeback; - unsigned long dirty; + unsigned long write_pending; - writeback = zone_page_state_snapshot(zone, - NR_WRITEBACK); - dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); + write_pending = zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); - if (2*(writeback + dirty) > reclaimable) { + if (2 * write_pending > reclaimable) { congestion_wait(BLK_RW_ASYNC, HZ/10); return true; } @@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; - enum migrate_mode migration_mode = MIGRATE_ASYNC; + enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; @@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry: + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* - * OK, we're below the kswapd watermark and have kicked background - * reclaim. Now things get more complex, so set up alloc_flags according - * to how we want to proceed. + * The adjusted alloc_flags might result in immediate success, so try + * that first */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. Don't try + * that for allocations that are allowed to ignore watermarks, as the + * ALLOC_NO_WATERMARKS attempt didn't yet happen. + */ + if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && + !gfp_pfmemalloc_allowed(gfp_mask)) { + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, + INIT_COMPACT_PRIORITY, + &compact_result); + if (page) + goto got_pg; + + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes THP page fault allocations + */ + if (gfp_mask & __GFP_NORETRY) { + /* + * If compaction is deferred for high-order allocations, + * it is because sync compaction recently failed. If + * this is the case and the caller requested a THP + * allocation, we do not want to heavily disrupt the + * system, so we fail the allocation instead of entering + * direct reclaim. + */ + if (compact_result == COMPACT_DEFERRED) + goto nopage; + + /* + * Looks like reclaim/compaction is worth trying, but + * sync compaction could be very expensive, so keep + * using async compaction. + */ + compact_priority = INIT_COMPACT_PRIORITY; + } + } + +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_kswapds(order, ac); + + if (gfp_pfmemalloc_allowed(gfp_mask)) + alloc_flags = ALLOC_NO_WATERMARKS; /* * Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { + if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } - /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, order, - alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + /* Attempt with potentially adjusted zonelist and alloc_flags */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; - /* Allocate without watermarks if the context allows */ - if (alloc_flags & ALLOC_NO_WATERMARKS) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - if (page) - goto got_pg; - } - /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* @@ -3640,38 +3590,6 @@ retry: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* - * Try direct compaction. The first pass is asynchronous. Subsequent - * attempts after direct reclaim are synchronous - */ - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, - migration_mode, - &compact_result); - if (page) - goto got_pg; - - /* Checks for THP-specific high-order allocations */ - if (is_thp_gfp_mask(gfp_mask)) { - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. - */ - if (compact_result == COMPACT_DEFERRED) - goto nopage; - - /* - * Compaction is contended so rather back off than cause - * excessive stalls. - */ - if(compact_result == COMPACT_CONTENDED) - goto nopage; - } - - if (order && compaction_made_progress(compact_result)) - compaction_retries++; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -3679,16 +3597,25 @@ retry: if (page) goto got_pg; + /* Try direct compaction and then allocating */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + compact_priority, &compact_result); + if (page) + goto got_pg; + + if (order && compaction_made_progress(compact_result)) + compaction_retries++; + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) - goto noretry; + goto nopage; /* * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) - goto noretry; + goto nopage; /* * Costly allocations might have made a progress but this doesn't mean @@ -3712,7 +3639,7 @@ retry: */ if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, - compact_result, &migration_mode, + compact_result, &compact_priority, compaction_retries)) goto retry; @@ -3727,25 +3654,6 @@ retry: goto retry; } -noretry: - /* - * High-order allocations do not necessarily loop after direct reclaim - * and reclaim/compaction depends on compaction being called after - * reclaim so call directly if necessary. - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. All other requests should tolerate - * at least light sync migration. - */ - if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_ASYNC; - else - migration_mode = MIGRATE_SYNC_LIGHT; - page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, - ac, migration_mode, - &compact_result); - if (page) - goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: @@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { struct page *page; unsigned int cpuset_mems_cookie; - unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), @@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = global_page_state(NR_SHMEM); + val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; - val->sharedram = node_page_state(nid, NR_SHMEM); - val->freeram = node_page_state(nid, NR_FREE_PAGES); + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; @@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter) unsigned long free_pcp = 0; int cpu; struct zone *zone; + pg_data_t *pgdat; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter) " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n" -#endif " free:%lu free_pcp:%lu free_cma:%lu\n", - global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_INACTIVE_ANON), - global_page_state(NR_ISOLATED_ANON), - global_page_state(NR_ACTIVE_FILE), - global_page_state(NR_INACTIVE_FILE), - global_page_state(NR_ISOLATED_FILE), - global_page_state(NR_UNEVICTABLE), - global_page_state(NR_FILE_DIRTY), - global_page_state(NR_WRITEBACK), - global_page_state(NR_UNSTABLE_NFS), + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), - global_page_state(NR_FILE_MAPPED), - global_page_state(NR_SHMEM), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR, - global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR, - global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR, -#endif global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); + for_each_online_pgdat(pgdat) { + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " unstable:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) + * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), +#endif + K(node_page_state(pgdat, NR_SHMEM)), + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + node_page_state(pgdat, NR_PAGES_SCANNED), + !pgdat_reclaimable(pgdat) ? "yes" : "no"); + } + for_each_populated_zone(zone) { int i; @@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" - " isolated(anon):%lukB" - " isolated(file):%lukB" + " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " dirty:%lukB" - " writeback:%lukB" - " mapped:%lukB" - " shmem:%lukB" -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" -#endif " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" - " unstable:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" - " writeback_tmp:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), - K(zone_page_state(zone, NR_ACTIVE_ANON)), - K(zone_page_state(zone, NR_INACTIVE_ANON)), - K(zone_page_state(zone, NR_ACTIVE_FILE)), - K(zone_page_state(zone, NR_INACTIVE_FILE)), - K(zone_page_state(zone, NR_UNEVICTABLE)), - K(zone_page_state(zone, NR_ISOLATED_ANON)), - K(zone_page_state(zone, NR_ISOLATED_FILE)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_FILE_DIRTY)), - K(zone_page_state(zone, NR_WRITEBACK)), - K(zone_page_state(zone, NR_FILE_MAPPED)), - K(zone_page_state(zone, NR_SHMEM)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR), - K(zone_page_state(zone, NR_SHMEM_PMDMAPPED) - * HPAGE_PMD_NR), - K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR), -#endif K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), - zone_page_state(zone, NR_KERNEL_STACK) * - THREAD_SIZE / 1024, + zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), - K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), - K(zone_page_state(zone, NR_FREE_CMA_PAGES)), - K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - K(zone_page_state(zone, NR_PAGES_SCANNED)), - (!zone_reclaimable(zone) ? "yes" : "no") - ); + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); @@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter) hugetlb_show_meminfo(); - printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } @@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone) zone->pageset = alloc_percpu(struct per_cpu_pageset); for_each_possible_cpu(cpu) zone_pageset_init(zone, cpu); + + if (!zone->zone_pgdat->per_cpu_nodestats) { + zone->zone_pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); + } } /* @@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kcompactd_wait); #endif pgdat_page_ext_init(pgdat); + spin_lock_init(&pgdat->lru_lock); + lruvec_init(node_lruvec(pgdat)); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) + pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; + pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; + zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); - zone->zone_pgdat = pgdat; zone_pcp_init(zone); - /* For bootup, initialized properly in watermark setup */ - mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); - - lruvec_init(&zone->lruvec); if (!size) continue; @@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; + pgdat->per_cpu_nodestats = NULL; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, @@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void) enum zone_type i, j; for_each_online_pgdat(pgdat) { + + pgdat->totalreserve_pages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; @@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; - zone->totalreserve_pages = max; + pgdat->totalreserve_pages += max; reserve_pages += max; } @@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - spin_unlock_irqrestore(&zone->lock, flags); } @@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_unmapped_pages = (zone->managed_pages * + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + struct pglist_data *pgdat; struct zone *zone; int rc; @@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, if (rc) return rc; + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + for_each_zone(zone) - zone->min_slab_pages = (zone->managed_pages * + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } diff --git a/mm/page_idle.c b/mm/page_idle.c index 4ea9c4ef5146..ae11aa914e55 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn) return NULL; zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); if (unlikely(!PageLRU(page))) { put_page(page); page = NULL; } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); return page; } diff --git a/mm/page_io.c b/mm/page_io.c index dcc5d3769608..fb1fa269d3a0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis, unsigned block_in_page; sector_t first_block; + cond_resched(); + first_block = bmap(inode, probe_block); if (first_block == 0) goto bad_bmap; diff --git a/mm/rmap.c b/mm/rmap.c index 8a13d9f7b566..709bc83703b1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -27,7 +27,7 @@ * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * zone_lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) @@ -1213,8 +1213,8 @@ void do_page_add_anon_rmap(struct page *page, * disabled. */ if (compound) - __inc_zone_page_state(page, NR_ANON_THPS); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); + __inc_node_page_state(page, NR_ANON_THPS); + __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); } if (unlikely(PageKsm(page))) return; @@ -1251,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - __inc_zone_page_state(page, NR_ANON_THPS); + __inc_node_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); + __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1282,7 +1282,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED); + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { if (PageTransCompound(page)) { VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1293,7 +1293,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(&page->_mapcount)) goto out; } - __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr); + __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); out: unlock_page_memcg(page); @@ -1322,18 +1322,18 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) goto out; VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED); + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) goto out; } /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * We use the irq-unsafe __{inc|mod}_zone_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr); + __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) @@ -1356,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_zone_page_state(page, NR_ANON_THPS); + __dec_node_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1375,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) { - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); + __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); deferred_split_huge_page(page); } } @@ -1404,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound) * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __dec_zone_page_state(page, NR_ANON_PAGES); + __dec_node_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); diff --git a/mm/shmem.c b/mm/shmem.c index 62e42c7d544c..2ac19a61d565 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -575,9 +575,9 @@ static int shmem_add_to_page_cache(struct page *page, if (!error) { mapping->nrpages += nr; if (PageTransHuge(page)) - __inc_zone_page_state(page, NR_SHMEM_THPS); - __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr); - __mod_zone_page_state(page_zone(page), NR_SHMEM, nr); + __inc_node_page_state(page, NR_SHMEM_THPS); + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -601,8 +601,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); - __dec_zone_page_state(page, NR_SHMEM); + __dec_node_page_state(page, NR_FILE_PAGES); + __dec_node_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); put_page(page); BUG_ON(error); @@ -1493,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + __inc_node_page_state(newpage, NR_FILE_PAGES); + __dec_node_page_state(oldpage, NR_FILE_PAGES); } spin_unlock_irq(&swap_mapping->tree_lock); diff --git a/mm/slab.h b/mm/slab.h index f33980ab0406..9653f2e2591a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -369,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s) if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; # endif + if (s->flags & SLAB_KASAN) + return s->object_size; /* * If we have the need to store the freelist pointer * back there or track user information then we can diff --git a/mm/slub.c b/mm/slub.c index f9da8716b8b3..74e7c8c30db8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } -static inline void *fixup_red_left(struct kmem_cache *s, void *p) +inline void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) p += s->red_left_pad; @@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) */ #if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; -#elif defined(CONFIG_KASAN) -static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif @@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); + off += kasan_metadata_size(s); + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ print_section("Padding ", p + off, size_from_object(s) - off); @@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); + off += kasan_metadata_size(s); + if (size_from_object(s) == off) return 1; @@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline void slab_free_hook(struct kmem_cache *s, void *x) +static inline void *slab_free_hook(struct kmem_cache *s, void *x) { + void *freeptr; + kmemleak_free_recursive(x, s->flags); /* @@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + freeptr = get_freepointer(s, x); + /* + * kasan_slab_free() may put x into memory quarantine, delaying its + * reuse. In this case the object's freelist pointer is changed. + */ kasan_slab_free(s, x); + return freeptr; } static inline void slab_free_freelist_hook(struct kmem_cache *s, @@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, void *object = head; void *tail_obj = tail ? : head; + void *freeptr; do { - slab_free_hook(s, object); - } while ((object != tail_obj) && - (object = get_freepointer(s, object))); + freeptr = slab_free_hook(s, object); + } while ((object != tail_obj) && (object = freeptr)); #endif } @@ -2878,16 +2888,13 @@ slab_empty: * same page) possible by specifying head and tail ptr, plus objects * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, struct page *page, - void *head, void *tail, int cnt, - unsigned long addr) +static __always_inline void do_slab_free(struct kmem_cache *s, + struct page *page, void *head, void *tail, + int cnt, unsigned long addr) { void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - - slab_free_freelist_hook(s, head, tail); - redo: /* * Determine the currently cpus per cpu slab. @@ -2921,6 +2928,27 @@ redo: } +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) +{ + slab_free_freelist_hook(s, head, tail); + /* + * slab_free_freelist_hook() could have put the items into quarantine. + * If so, no need to free them. + */ + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + return; + do_slab_free(s, page, head, tail, cnt, addr); +} + +#ifdef CONFIG_KASAN +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) +{ + do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr); +} +#endif + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); @@ -3363,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->object_size; + size_t size = s->object_size; int order; /* @@ -3422,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the object. */ size += 2 * sizeof(struct track); +#endif + kasan_cache_create(s, &size, &s->flags); +#ifdef CONFIG_SLUB_DEBUG if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch diff --git a/mm/sparse.c b/mm/sparse.c index 5d0cf4540364..36d7bbb80e49 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -/* - * Although written for the SPARSEMEM_EXTREME case, this happens - * to also work for the flat array case because - * NR_SECTION_ROOTS==NR_MEM_SECTIONS. - */ +#ifdef CONFIG_SPARSEMEM_EXTREME int __section_nr(struct mem_section* ms) { unsigned long root_nr; @@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms) return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } +#else +int __section_nr(struct mem_section* ms) +{ + return (int)(ms - mem_section[0]); +} +#endif /* * During early boot, before section_mem_map is used for an actual diff --git a/mm/swap.c b/mm/swap.c index 616df4ddd870..75c63bb2a1da 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page) struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&zone->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irqsave(zone_lru_lock(zone), flags); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(zone_lru_lock(zone), flags); } mem_cgroup_uncharge(page); } @@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, void *arg) { int i; - struct zone *zone = NULL; + struct pglist_data *pgdat = NULL; struct lruvec *lruvec; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); + struct pglist_data *pagepgdat = page_pgdat(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = pagezone; - spin_lock_irqsave(&zone->lru_lock, flags); + if (pagepgdat != pgdat) { + if (pgdat) + spin_unlock_irqrestore(&pgdat->lru_lock, flags); + pgdat = pagepgdat; + spin_lock_irqsave(&pgdat->lru_lock, flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); (*move_fn)(page, lruvec, arg); } - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); + if (pgdat) + spin_unlock_irqrestore(&pgdat->lru_lock, flags); release_pages(pvec->pages, pvec->nr, pvec->cold); pagevec_reinit(pvec); } @@ -318,9 +318,9 @@ void activate_page(struct page *page) struct zone *zone = page_zone(page); page = compound_head(page); - spin_lock_irq(&zone->lru_lock); - __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); - spin_unlock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); + __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); + spin_unlock_irq(zone_lru_lock(zone)); } #endif @@ -445,16 +445,16 @@ void lru_cache_add(struct page *page) */ void add_page_to_unevictable_list(struct page *page) { - struct zone *zone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); } /** @@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); - struct zone *zone = NULL; + struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; unsigned long uninitialized_var(flags); unsigned int uninitialized_var(lock_batch); @@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same zone. The lock is held only if zone != NULL. + * same pgdat. The lock is held only if pgdat != NULL. */ - if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; + if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } if (is_huge_zero_page(page)) { @@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold) continue; if (PageCompound(page)) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct zone *pagezone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, + if (pgdat != locked_pgdat) { + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); lock_batch = 0; - zone = pagezone; - spin_lock_irqsave(&zone->lru_lock, flags); + locked_pgdat = pgdat; + spin_lock_irqsave(&locked_pgdat->lru_lock, flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold) list_add(&page->lru, &pages_to_free); } - if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); @@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); VM_BUG_ON(NR_CPUS != 1 && - !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); + !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); if (!list) SetPageLRU(page_tail); diff --git a/mm/swap_state.c b/mm/swap_state.c index c99463ac02fb..c8310a37be3a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) entry.val, page); if (likely(!error)) { address_space->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_node_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } spin_unlock_irq(&address_space->tree_lock); @@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page) set_page_private(page, 0); ClearPageSwapCache(page); address_space->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); + __dec_node_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } diff --git a/mm/util.c b/mm/util.c index 8d010ef2ce1c..662cddf914af 100644 --- a/mm/util.c +++ b/mm/util.c @@ -528,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); + free += global_node_page_state(NR_FILE_PAGES); /* * shmem pages shouldn't be counted as free in this @@ -536,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * that won't affect the overall amount of available * memory in the system. */ - free -= global_page_state(NR_SHMEM); + free -= global_node_page_state(NR_SHMEM); free += get_nr_swap_pages(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 21d417ccff69..650d26832569 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -84,6 +84,9 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; + /* The highest zone to isolate pages for reclaim from */ + enum zone_type reclaim_idx; + unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc) } #endif +/* + * This misses isolated pages which are not accounted for to save counters. + * As the data only determines if reclaim or compaction continues, it is + * not expected that isolated pages will be a dominating factor. + */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); + if (get_nr_swap_pages() > 0) + nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); + + return nr; +} + +unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) +{ + unsigned long nr; + + nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_ISOLATED_ANON); + nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); return nr; } -bool zone_reclaimable(struct zone *zone) +bool pgdat_reclaimable(struct pglist_data *pgdat) { - return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < - zone_reclaimable_pages(zone) * 6; + return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < + pgdat_reclaimable_pages(pgdat) * 6; } unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); - return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); + return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } /* @@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page); - inc_zone_page_state(page, NR_VMSCAN_WRITE); + inc_node_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, @@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON_PAGE(PageActive(page), page); - VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - test_bit(ZONE_WRITEBACK, &zone->flags)) { + test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; goto keep_locked; @@ -1092,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !test_bit(ZONE_DIRTY, &zone->flags))) { + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() * except we already have the page isolated * and know it's dirty */ - inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); goto keep_locked; @@ -1266,11 +1286,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone, &sc, + ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; } @@ -1348,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) return ret; } + /* - * zone->lru_lock is heavily contended. Some of the functions that + * Update LRU sizes after isolating pages. The LRU size updates must + * be complete before mem_cgroup_update_lru_size due to a santity check. + */ +static __always_inline void update_lru_sizes(struct lruvec *lruvec, + enum lru_list lru, unsigned long *nr_zone_taken, + unsigned long nr_taken) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_zone_taken[zid]) + continue; + + __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + } + +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); +#endif +} + +/* + * zone_lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * @@ -1375,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long scan; + unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; + unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long scan, nr_pages; + LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src); scan++) { + !list_empty(src);) { struct page *page; page = lru_to_page(src); @@ -1386,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); + if (page_zonenum(page) > sc->reclaim_idx) { + list_move(&page->lru, &pages_skipped); + nr_skipped[page_zonenum(page)]++; + continue; + } + + /* + * Account for scanned and skipped separetly to avoid the pgdat + * being prematurely marked unreclaimable by pgdat_reclaimable. + */ + scan++; + switch (__isolate_lru_page(page, mode)) { case 0: - nr_taken += hpage_nr_pages(page); + nr_pages = hpage_nr_pages(page); + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); break; @@ -1402,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } } + /* + * Splice any skipped pages to the start of the LRU list. Note that + * this disrupts the LRU order when reclaiming for lower zones but + * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX + * scanning would soon rescan the same pages to skip and put the + * system at risk of premature OOM. + */ + if (!list_empty(&pages_skipped)) { + int zid; + unsigned long total_skipped = 0; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_skipped[zid]) + continue; + + __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); + total_skipped += nr_skipped[zid]; + } + + /* + * Account skipped pages as a partial scan as the pgdat may be + * close to unreclaimable. If the LRU list is empty, account + * skipped pages as a full scan. + */ + scan += list_empty(src) ? total_skipped : total_skipped >> 2; + + list_splice(&pages_skipped, src); + } *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); + update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); return nr_taken; } @@ -1444,8 +1533,8 @@ int isolate_lru_page(struct page *page) struct zone *zone = page_zone(page); struct lruvec *lruvec; - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(zone_lru_lock(zone)); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1453,7 +1542,7 @@ int isolate_lru_page(struct page *page) del_page_from_lru_list(page, lruvec, lru); ret = 0; } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); } return ret; } @@ -1465,7 +1554,7 @@ int isolate_lru_page(struct page *page) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct zone *zone, int file, +static int too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; @@ -1477,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file, return 0; if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* @@ -1499,7 +1588,7 @@ static noinline_for_stack void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); LIST_HEAD(pages_to_free); /* @@ -1512,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); @@ -1535,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); } @@ -1563,8 +1652,32 @@ static int current_may_throttle(void) bdi_write_congested(current->backing_dev_info); } +static bool inactive_reclaimable_pages(struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) +{ + int zid; + struct zone *zone; + int file = is_file_lru(lru); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!global_reclaim(sc)) + return true; + + for (zid = sc->reclaim_idx; zid >= 0; zid--) { + zone = &pgdat->node_zones[zid]; + if (!populated_zone(zone)) + continue; + + if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + + LRU_FILE * file) >= SWAP_CLUSTER_MAX) + return true; + } + + return false; +} + /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ static noinline_for_stack unsigned long @@ -1582,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - while (unlikely(too_many_isolated(zone, file, sc))) { + if (!inactive_reclaimable_pages(lruvec, sc, lru)) + return 0; + + while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ @@ -1600,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); + __count_vm_events(PGSCAN_DIRECT, nr_scanned); } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, &nr_dirty, &nr_unqueued_dirty, &nr_congested, &nr_writeback, &nr_immediate, false); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); if (global_reclaim(sc)) { if (current_is_kswapd()) - __count_zone_vm_events(PGSTEAL_KSWAPD, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); else - __count_zone_vm_events(PGSTEAL_DIRECT, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); @@ -1661,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback == nr_taken) - set_bit(ZONE_WRITEBACK, &zone->flags); + set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * Legacy memcg will stall in page writeback so avoid forcibly @@ -1673,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - set_bit(ZONE_CONGESTED, &zone->flags); + set_bit(PGDAT_CONGESTED, &pgdat->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_DIRTY and kswapd will start writing pages from + * the pgdat PGDAT_DIRTY and kswapd will start writing pages from * reclaim context. */ if (nr_unqueued_dirty == nr_taken) - set_bit(ZONE_DIRTY, &zone->flags); + set_bit(PGDAT_DIRTY, &pgdat->flags); /* * If kswapd scans pages marked marked for immediate @@ -1701,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, */ if (!sc->hibernation_mode && !current_is_kswapd() && current_may_throttle()) - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + nr_scanned, nr_reclaimed, sc->priority, file); return nr_reclaimed; } @@ -1715,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * processes, from rmap. * * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone->lru_lock across the whole operation. But if + * appropriate to hold zone_lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone->lru_lock around each page. It's impossible to balance + * should drop zone_lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. @@ -1731,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *pages_to_free, enum lru_list lru) { - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long pgmoved = 0; struct page *page; int nr_pages; while (!list_empty(list)) { page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1754,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); } @@ -1783,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -1792,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); - __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); + __count_vm_events(PGREFILL, nr_scanned); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -1850,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); /* * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This @@ -1861,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan, move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); @@ -1894,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_list_is_low(struct lruvec *lruvec, bool file) +static bool inactive_list_is_low(struct lruvec *lruvec, bool file, + struct scan_control *sc) { unsigned long inactive_ratio; unsigned long inactive; unsigned long active; unsigned long gb; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int zid; /* * If we don't have swap space, anonymous page deactivation @@ -1911,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file) inactive = lruvec_lru_size(lruvec, file * LRU_FILE); active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); + /* + * For zone-constrained allocations, it is necessary to check if + * deactivations are required for lowmem to be reclaimed. This + * calculates the inactive/active pages available in eligible zones. + */ + for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + unsigned long inactive_zone, active_zone; + + if (!populated_zone(zone)) + continue; + + inactive_zone = zone_page_state(zone, + NR_ZONE_LRU_BASE + (file * LRU_FILE)); + active_zone = zone_page_state(zone, + NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + + inactive -= min(inactive, inactive_zone); + active -= min(active, active_zone); + } + gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); @@ -1924,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru))) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -1956,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; @@ -1977,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * well. */ if (current_is_kswapd()) { - if (!zone_reclaimable(zone)) + if (!pgdat_reclaimable(pgdat)) force_scan = true; if (!mem_cgroup_online(memcg)) force_scan = true; @@ -2023,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long zonefile; - unsigned long zonefree; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long total_high_wmark = 0; - zonefree = zone_page_state(zone, NR_FREE_PAGES); - zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); - if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!populated_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { scan_balance = SCAN_ANON; goto out; } @@ -2045,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true) && + if (!inactive_list_is_low(lruvec, true, sc) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2077,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -2098,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -2174,12 +2321,12 @@ static inline void init_tlb_ubc(void) #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, +static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *lru_pages) { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2287,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false)) + if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2312,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc) * calls try_to_compact_zone() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; + int z; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) @@ -2352,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) - inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order, 0, 0)) { - case COMPACT_PARTIAL: - case COMPACT_CONTINUE: - return false; - default: - return true; + for (z = 0; z <= sc->reclaim_idx; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!populated_zone(zone)) + continue; + + switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + /* check next zone */ + ; + } } + return true; } -static bool shrink_zone(struct zone *zone, struct scan_control *sc, - bool is_classzone) +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; @@ -2379,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, do { struct mem_cgroup *root = sc->target_mem_cgroup; struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, + .pgdat = pgdat, .priority = sc->priority, }; - unsigned long zone_lru_pages = 0; + unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -2403,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_zone_memcg(zone, memcg, sc, &lru_pages); - zone_lru_pages += lru_pages; + shrink_node_memcg(pgdat, memcg, sc, &lru_pages); + node_lru_pages += lru_pages; - if (memcg && is_classzone) - shrink_slab(sc->gfp_mask, zone_to_nid(zone), + if (!global_reclaim(sc)) + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->nr_scanned - scanned, lru_pages); @@ -2419,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the - * zone. + * node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will @@ -2437,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. */ - if (global_reclaim(sc) && is_classzone) - shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + if (global_reclaim(sc)) + shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, sc->nr_scanned - nr_scanned, - zone_lru_pages); + node_lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -2455,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; - } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); return reclaimable; @@ -2465,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Returns true if compaction should go ahead for a high-order request, or * the high-order allocation would succeed without compaction. */ -static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { - unsigned long balance_gap, watermark; + unsigned long watermark; bool watermark_ok; /* @@ -2476,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx); + watermark = high_wmark_pages(zone) + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, sc->order)) return watermark_ok; /* * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. */ - if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) + if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2503,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over high_wmark_pages(zone). - * Because: - * a) The caller may be trying to free *extra* pages to satisfy a higher-order - * allocation or - * b) The target zone may be at high_wmark_pages(zone) but the lower zones - * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' - * zone defense algorithm. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ @@ -2521,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; - enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + pg_data_t *last_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2529,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; - if (buffer_heads_over_limit) + if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; + sc->reclaim_idx = gfp_zone(sc->gfp_mask); + } for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - enum zone_type classzone_idx; - - if (!populated_zone(zone)) - continue; - - classzone_idx = requested_highidx; - while (!populated_zone(zone->zone_pgdat->node_zones + - classzone_idx)) - classzone_idx--; - + sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. @@ -2554,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; if (sc->priority != DEF_PRIORITY && - !zone_reclaimable(zone)) + !pgdat_reclaimable(zone->zone_pgdat)) continue; /* Let kswapd poll it */ /* @@ -2568,20 +2705,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && - zonelist_zone_idx(z) <= requested_highidx && - compaction_ready(zone, sc->order, requested_highidx)) { + compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } /* + * Shrink each node in the zonelist once. If the + * zonelist is ordered by zone (not the default) then a + * node may be shrunk multiple times but in that case + * the user prefers lower zones being preserved. + */ + if (zone->zone_pgdat == last_pgdat) + continue; + + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; @@ -2589,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); + /* See comment about same check for global reclaim above */ + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + shrink_node(zone->zone_pgdat, sc); } /* @@ -2625,7 +2774,7 @@ retry: delayacct_freepages_start(); if (global_reclaim(sc)) - count_vm_event(ALLOCSTALL); + __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, @@ -2692,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!populated_zone(zone) || - zone_reclaimable_pages(zone) == 0) + pgdat_reclaimable_pages(pgdat) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2707,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - pgdat->classzone_idx = min(pgdat->classzone_idx, + pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, (enum zone_type)ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -2815,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, @@ -2833,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask); + gfp_mask, + sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2844,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_MEMCG -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, - struct zone *zone, + pg_data_t *pgdat, unsigned long *nr_scanned) { struct scan_control sc = { @@ -2854,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; unsigned long lru_pages; @@ -2863,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.may_writepage, - sc.gfp_mask); + sc.gfp_mask, + sc.reclaim_idx); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. - * if we don't reclaim here, the shrink_zone from balance_pgdat + * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_zone_memcg(zone, memcg, &sc, &lru_pages); + shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2892,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, @@ -2910,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, - sc.gfp_mask); + sc.gfp_mask, + sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2920,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct zone *zone, struct scan_control *sc) +static void age_active_anon(struct pglist_data *pgdat, + struct scan_control *sc) { struct mem_cgroup *memcg; @@ -2929,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false)) + if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2939,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, bool highorder, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone) + balance_gap; + unsigned long mark = high_wmark_pages(zone); + + if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return false; /* - * When checking from pgdat_balanced(), kswapd should stop and sleep - * when it reaches the high order-0 watermark and let kcompactd take - * over. Other callers such as wakeup_kswapd() want to determine the - * true high-order watermark. + * If any eligible zone is balanced then the node is not considered + * to be congested or dirty */ - if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { - mark += (1UL << order); - order = 0; - } - - return zone_watermark_ok_safe(zone, order, mark, classzone_idx); -} - -/* - * pgdat_balanced() is used when checking if a node is balanced. - * - * For order-0, all zones must be balanced! - * - * For high-order allocations only zones that meet watermarks and are in a - * zone allowed by the callers classzone_idx are added to balanced_pages. The - * total of balanced pages must be at least 25% of the zones allowed by - * classzone_idx for the node to be considered balanced. Forcing all zones to - * be balanced for high orders can cause excessive reclaim when there are - * imbalanced zones. - * The choice of 25% is due to - * o a 16M DMA zone that is balanced will not balance a zone on any - * reasonable sized machine - * o On all other machines, the top zone must be at least a reasonable - * percentage of the middle zones. For example, on 32-bit x86, highmem - * would need to be at least 256M for it to be balance a whole node. - * Similarly, on x86-64 the Normal zone would need to be at least 1G - * to balance a node on its own. These seemed like reasonable ratios. - */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) -{ - unsigned long managed_pages = 0; - unsigned long balanced_pages = 0; - int i; - - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - managed_pages += zone->managed_pages; - - /* - * A special case here: - * - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well! - */ - if (!zone_reclaimable(zone)) { - balanced_pages += zone->managed_pages; - continue; - } + clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); + clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - if (zone_balanced(zone, order, false, 0, i)) - balanced_pages += zone->managed_pages; - else if (!order) - return false; - } - - if (order) - return balanced_pages >= (managed_pages >> 2); - else - return true; + return true; } /* @@ -3023,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, - int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ - if (remaining) - return false; + int i; /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3046,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - return pgdat_balanced(pgdat, order, classzone_idx); + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (!zone_balanced(zone, order, classzone_idx)) + return false; + } + + return true; } /* - * kswapd shrinks the zone by the number of pages required to reach - * the high watermark. + * kswapd shrinks a node of pages that are at or below the highest usable + * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ -static bool kswapd_shrink_zone(struct zone *zone, - int classzone_idx, +static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { - unsigned long balance_gap; - bool lowmem_pressure; + struct zone *zone; + int z; - /* Reclaim above the high watermark. */ - sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + /* Reclaim a number of pages proportional to the number of zones */ + sc->nr_to_reclaim = 0; + for (z = 0; z <= sc->reclaim_idx; z++) { + zone = pgdat->node_zones + z; + if (!populated_zone(zone)) + continue; - /* - * We put equal pressure on every zone, unless one zone has way too - * many pages free already. The "too many pages" is defined as the - * high wmark plus a "gap" where the gap is either the low - * watermark or 1% of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); + } /* - * If there is no low memory pressure or the zone is balanced then no - * reclaim is necessary + * Historically care was taken to put equal pressure on all zones but + * now pressure is applied based on node LRU order. */ - lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, sc->order, false, - balance_gap, classzone_idx)) - return true; - - shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - - clear_bit(ZONE_WRITEBACK, &zone->flags); + shrink_node(pgdat, sc); /* - * If a zone reaches its high watermark, consider it to be no longer - * congested. It's possible there are dirty pages backed by congested - * BDIs but as pressure is relieved, speculatively avoid congestion - * waits. + * Fragmentation may mean that the system cannot be rebalanced for + * high-order allocations. If twice the allocation size has been + * reclaimed then recheck watermarks only at order-0 to prevent + * excessive reclaim. Assume that a process requested a high-order + * can direct reclaim/compact. */ - if (zone_reclaimable(zone) && - zone_balanced(zone, sc->order, false, 0, classzone_idx)) { - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); - } + if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) + sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; } /* - * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at high_wmark_pages(zone). + * For kswapd, balance_pgdat() will reclaim pages across a node from zones + * that are eligible for use by the caller until at least one zone is + * balanced. * - * Returns the highest zone idx kswapd was reclaiming at - * - * There is special handling here for zones which are full of pinned pages. - * This can happen if the pages are all mlocked, or if they are all used by - * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. - * What we do is to detect the case where all pages in the zone have been - * scanned twice and there has been zero successful reclaim. Mark the zone as - * dead and from now on, only perform a short scan. Basically we're polling - * the zone for when the problem goes away. + * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the - * lower zones regardless of the number of free pages in the lower zones. This - * interoperates with the page allocator fallback scheme to ensure that aging - * of pages is balanced across the zones. + * found to have free_pages <= high_wmark_pages(zone), any page is that zone + * or lower is eligible for reclaim until at least one usable zone is + * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, @@ -3145,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) bool raise_priority = true; sc.nr_reclaimed = 0; + sc.reclaim_idx = classzone_idx; /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning + * If the number of buffer_heads exceeds the maximum allowed + * then consider reclaiming from all zones. This has a dual + * purpose -- on 64-bit systems it is expected that + * buffer_heads are stripped during active rotation. On 32-bit + * systems, highmem pages can pin lowmem memory and shrinking + * buffers can relieve lowmem pressure. Reclaim may still not + * go ahead if all eligible zones for the original allocation + * request are balanced to avoid excessive reclaim from kswapd. */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) - continue; - - /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. - */ - age_active_anon(zone, &sc); + if (buffer_heads_over_limit) { + for (i = MAX_NR_ZONES - 1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; - /* - * If the number of buffer_heads in the machine - * exceeds the maximum allowed level and this node - * has a highmem zone, force kswapd to reclaim from - * it to relieve lowmem pressure. - */ - if (buffer_heads_over_limit && is_highmem_idx(i)) { - end_zone = i; + sc.reclaim_idx = i; break; } + } - if (!zone_balanced(zone, order, false, 0, 0)) { - end_zone = i; - break; - } else { - /* - * If balanced, clear the dirty and congested - * flags - */ - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); - } + /* + * Only reclaim if there are no eligible zones. Check from + * high to low zone as allocations prefer higher zones. + * Scanning from low to high zone would allow congestion to be + * cleared during a very small window when a small low + * zone was balanced even under extreme pressure when the + * overall node may be congested. Note that sc.reclaim_idx + * is not used as buffer_heads_over_limit may have adjusted + * it. + */ + for (i = classzone_idx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + + if (zone_balanced(zone, sc.order, classzone_idx)) + goto out; } - if (i < 0) - goto out; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. All + * pages are rotated regardless of classzone as this is + * about consistent aging. + */ + age_active_anon(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2) + if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) sc.may_writepage = 1; + /* Call soft limit reclaim before calling shrink_node. */ + sc.nr_scanned = 0; + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, + sc.gfp_mask, &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* - * Now scan the zone in the dma->highmem direction, stopping - * at the last zone which needs scanning. - * - * We do this because the page allocator works in the opposite - * direction. This prevents the page allocator from allocating - * pages behind kswapd's direction of progress, which would - * cause too much scanning of the lower zones. + * There should be no need to raise the scanning priority if + * enough pages are already being scanned that that high + * watermark would be met at 100% efficiency. */ - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) - continue; - - sc.nr_scanned = 0; - - nr_soft_scanned = 0; - /* - * Call soft limit reclaim before calling shrink_zone. - */ - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - order, sc.gfp_mask, - &nr_soft_scanned); - sc.nr_reclaimed += nr_soft_reclaimed; - - /* - * There should be no need to raise the scanning - * priority if enough pages are already being scanned - * that that high watermark would be met at 100% - * efficiency. - */ - if (kswapd_shrink_zone(zone, end_zone, &sc)) - raise_priority = false; - } + if (kswapd_shrink_node(pgdat, &sc)) + raise_priority = false; /* * If the low watermark is met there is no need for processes @@ -3259,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; - } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, classzone_idx)); + } while (sc.priority >= 1); out: /* - * Return the highest zone idx we were reclaiming at so - * prepare_kswapd_sleep() makes the same decisions as here. + * Return the order kswapd stopped reclaiming at as + * prepare_kswapd_sleep() takes it into account. If another caller + * entered the allocator slow path while kswapd was awake, order will + * remain at the higher level. */ - return end_zone; + return sc.order; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, - int classzone_idx, int balanced_classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, + unsigned int classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3282,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, - balanced_classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3296,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, classzone_idx); remaining = schedule_timeout(HZ/10); + + /* + * If woken prematurely then reset kswapd_classzone_idx and + * order. The values will either be from a wakeup request or + * the previous request that slept prematurely. + */ + if (remaining) { + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); + } + finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } @@ -3307,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, - balanced_classzone_idx)) { + if (!remaining && + prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3349,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, */ static int kswapd(void *p) { - unsigned long order, new_order; - int classzone_idx, new_classzone_idx; - int balanced_classzone_idx; + unsigned int alloc_order, reclaim_order, classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3381,38 +3449,20 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = new_order = 0; - classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; - balanced_classzone_idx = classzone_idx; + pgdat->kswapd_order = alloc_order = reclaim_order = 0; + pgdat->kswapd_classzone_idx = classzone_idx = 0; for ( ; ; ) { bool ret; - /* - * While we were reclaiming, there might have been another - * wakeup, so check the values. - */ - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; +kswapd_try_sleep: + kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, + classzone_idx); - if (order < new_order || classzone_idx > new_classzone_idx) { - /* - * Don't sleep if someone wants a larger 'order' - * allocation or has tigher zone constraints - */ - order = new_order; - classzone_idx = new_classzone_idx; - } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx, - balanced_classzone_idx); - order = pgdat->kswapd_max_order; - classzone_idx = pgdat->classzone_idx; - new_order = order; - new_classzone_idx = classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + /* Read the new order and classzone_idx */ + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = pgdat->kswapd_classzone_idx; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = 0; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3422,11 +3472,25 @@ static int kswapd(void *p) * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) { - trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = balance_pgdat(pgdat, order, - classzone_idx); - } + if (ret) + continue; + + /* + * Reclaim begins at the requested order but if a high-order + * reclaim fails then kswapd falls back to reclaiming for + * order-0. If that happens, kswapd will consider sleeping + * for the order it finished reclaiming at (reclaim_order) + * but kcompactd is woken to compact for the original + * request (alloc_order). + */ + trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + alloc_order); + reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + if (reclaim_order < alloc_order) + goto kswapd_try_sleep; + + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3442,6 +3506,7 @@ static int kswapd(void *p) void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + int z; if (!populated_zone(zone)) return; @@ -3449,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) { - pgdat->kswapd_max_order = order; - pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); - } + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, true, 0, 0)) - return; + + /* Only wake kswapd if all zones are unbalanced */ + for (z = 0; z <= classzone_idx; z++) { + zone = pgdat->node_zones + z; + if (!populated_zone(zone)) + continue; + + if (zone_balanced(zone, order, classzone_idx)) + return; + } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); @@ -3477,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, + .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, @@ -3578,12 +3650,12 @@ module_init(kswapd_init) #ifdef CONFIG_NUMA /* - * Zone reclaim mode + * Node reclaim mode * - * If non-zero call zone_reclaim when the number of free pages falls below + * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ -int zone_reclaim_mode __read_mostly; +int node_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ @@ -3591,14 +3663,14 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* - * Priority for ZONE_RECLAIM. This determines the fraction of pages + * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ -#define ZONE_RECLAIM_PRIORITY 4 +#define NODE_RECLAIM_PRIORITY 4 /* - * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; @@ -3609,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; -static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { - unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); - unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_FILE); + unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); + unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than @@ -3624,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static unsigned long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; @@ -3632,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about - * pages like swapcache and zone_unmapped_file_pages() provides + * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_UNMAP) - nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + if (node_reclaim_mode & RECLAIM_UNMAP) + nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else - nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* If we can't clean pages, remove dirty pages from consideration */ - if (!(zone_reclaim_mode & RECLAIM_WRITE)) - delta += zone_page_state(zone, NR_FILE_DIRTY); + if (!(node_reclaim_mode & RECLAIM_WRITE)) + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) @@ -3652,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) } /* - * Try to free up some pages from this zone through reclaim. + * Try to free up some pages from this node through reclaim. */ -static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; + int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, - .priority = ZONE_RECLAIM_PRIORITY, - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, + .reclaim_idx = classzone_idx, }; cond_resched(); @@ -3681,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ do { - shrink_zone(zone, &sc, true); + shrink_node(pgdat, &sc); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } @@ -3697,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return sc.nr_reclaimed >= nr_pages; } -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { - int node_id; int ret; /* - * Zone reclaim reclaims unmapped file backed pages and + * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately - * thrown out if the zone is overallocated. So we do not reclaim - * if less than a specified percentage of the zone is used by + * thrown out if the node is overallocated. So we do not reclaim + * if less than a specified percentage of the node is used by * unmapped file backed pages. */ - if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return ZONE_RECLAIM_FULL; + if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && + sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + return NODE_RECLAIM_FULL; - if (!zone_reclaimable(zone)) - return ZONE_RECLAIM_FULL; + if (!pgdat_reclaimable(pgdat)) + return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) - return ZONE_RECLAIM_NOSCAN; + return NODE_RECLAIM_NOSCAN; /* - * Only run zone reclaim on the local zone or on zones that do not + * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ - node_id = zone_to_nid(zone); - if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return ZONE_RECLAIM_NOSCAN; + if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) + return NODE_RECLAIM_NOSCAN; - if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) - return ZONE_RECLAIM_NOSCAN; + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + return NODE_RECLAIM_NOSCAN; - ret = __zone_reclaim(zone, gfp_mask, order); - clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); + ret = __node_reclaim(pgdat, gfp_mask, order); + clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); @@ -3778,24 +3850,23 @@ int page_evictable(struct page *page) void check_move_unevictable_pages(struct page **pages, int nr_pages) { struct lruvec *lruvec; - struct zone *zone = NULL; + struct pglist_data *pgdat = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; - struct zone *pagezone; + struct pglist_data *pagepgdat = page_pgdat(page); pgscanned++; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (pagepgdat != pgdat) { + if (pgdat) + spin_unlock_irq(&pgdat->lru_lock); + pgdat = pagepgdat; + spin_lock_irq(&pgdat->lru_lock); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); if (!PageLRU(page) || !PageUnevictable(page)) continue; @@ -3811,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) } } - if (zone) { + if (pgdat) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); } } #endif /* CONFIG_SHMEM */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 7997f52935c9..89cec42d19ff 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu) * * vm_stat contains the global counters */ -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; -EXPORT_SYMBOL(vm_stat); +atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; +EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone) */ void refresh_zone_stat_thresholds(void) { + struct pglist_data *pgdat; struct zone *zone; int cpu; int threshold; + /* Zero current pgdat thresholds */ + for_each_online_pgdat(pgdat) { + for_each_online_cpu(cpu) { + per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0; + } + } + for_each_populated_zone(zone) { + struct pglist_data *pgdat = zone->zone_pgdat; unsigned long max_drift, tolerate_drift; threshold = calculate_normal_threshold(zone); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + int pgdat_threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; + /* Base nodestat threshold on the largest populated zone. */ + pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; + per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold + = max(threshold, pgdat_threshold); + } + /* * Only set percpu_drift_mark if there is a danger that * NR_FREE_PAGES reports the low watermark is ok when in fact @@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } EXPORT_SYMBOL(__mod_zone_page_state); +void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + long x; + long t; + + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(x > t || x < -t)) { + node_page_state_add(x, pgdat, item); + x = 0; + } + __this_cpu_write(*p, x); +} +EXPORT_SYMBOL(__mod_node_page_state); + /* * Optimized increment and decrement functions. * @@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) } } +void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + s8 v, t; + + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; + + node_page_state_add(v + overstep, pgdat, item); + __this_cpu_write(*p, -overstep); + } +} + void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } EXPORT_SYMBOL(__inc_zone_page_state); +void __inc_node_page_state(struct page *page, enum node_stat_item item) +{ + __inc_node_state(page_pgdat(page), item); +} +EXPORT_SYMBOL(__inc_node_page_state); + void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; @@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) } } +void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + s8 v, t; + + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; + + node_page_state_add(v - overstep, pgdat, item); + __this_cpu_write(*p, overstep); + } +} + void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } EXPORT_SYMBOL(__dec_zone_page_state); +void __dec_node_page_state(struct page *page, enum node_stat_item item) +{ + __dec_node_state(page_pgdat(page), item); +} +EXPORT_SYMBOL(__dec_node_page_state); + #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * If we have cmpxchg_local support then we do not need to incur the overhead @@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); * 1 Overstepping half of threshold * -1 Overstepping minus half of threshold */ -static inline void mod_state(struct zone *zone, enum zone_stat_item item, - long delta, int overstep_mode) +static inline void mod_zone_state(struct zone *zone, + enum zone_stat_item item, long delta, int overstep_mode) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item, void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { - mod_state(zone, item, delta, 0); + mod_zone_state(zone, item, delta, 0); } EXPORT_SYMBOL(mod_zone_page_state); -void inc_zone_state(struct zone *zone, enum zone_stat_item item) -{ - mod_state(zone, item, 1, 1); -} - void inc_zone_page_state(struct page *page, enum zone_stat_item item) { - mod_state(page_zone(page), item, 1, 1); + mod_zone_state(page_zone(page), item, 1, 1); } EXPORT_SYMBOL(inc_zone_page_state); void dec_zone_page_state(struct page *page, enum zone_stat_item item) { - mod_state(page_zone(page), item, -1, -1); + mod_zone_state(page_zone(page), item, -1, -1); } EXPORT_SYMBOL(dec_zone_page_state); + +static inline void mod_node_state(struct pglist_data *pgdat, + enum node_stat_item item, int delta, int overstep_mode) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + long o, n, t, z; + + do { + z = 0; /* overflow to node counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong the cpu if we get + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a node. + */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (n > t || n < -t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to node counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + node_page_state_add(z, pgdat, item); +} + +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + mod_node_state(pgdat, item, delta, 0); +} +EXPORT_SYMBOL(mod_node_page_state); + +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + mod_node_state(pgdat, item, 1, 1); +} + +void inc_node_page_state(struct page *page, enum node_stat_item item) +{ + mod_node_state(page_pgdat(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_node_page_state); + +void dec_node_page_state(struct page *page, enum node_stat_item item) +{ + mod_node_state(page_pgdat(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_node_page_state); #else /* * Use interrupt disable to serialize counter updates @@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } EXPORT_SYMBOL(mod_zone_page_state); -void inc_zone_state(struct zone *zone, enum zone_stat_item item) -{ - unsigned long flags; - - local_irq_save(flags); - __inc_zone_state(zone, item); - local_irq_restore(flags); -} - void inc_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; @@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); -#endif +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_node_state(pgdat, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_node_state); + +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_node_page_state(pgdat, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_node_page_state); + +void inc_node_page_state(struct page *page, enum node_stat_item item) +{ + unsigned long flags; + struct pglist_data *pgdat; + + pgdat = page_pgdat(page); + local_irq_save(flags); + __inc_node_state(pgdat, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_node_page_state); + +void dec_node_page_state(struct page *page, enum node_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __dec_node_page_state(page, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_node_page_state); +#endif /* * Fold a differential into the global counters. * Returns the number of counters updated. */ -static int fold_diff(int *diff) +static int fold_diff(int *zone_diff, int *node_diff) { int i; int changes = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (diff[i]) { - atomic_long_add(diff[i], &vm_stat[i]); + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); changes++; } return changes; @@ -462,9 +641,11 @@ static int fold_diff(int *diff) */ static int refresh_cpu_vm_stats(bool do_pagesets) { + struct pglist_data *pgdat; struct zone *zone; int i; - int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; for_each_populated_zone(zone) { @@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (v) { atomic_long_add(v, &zone->vm_stat[i]); - global_diff[i] += v; + global_zone_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ __this_cpu_write(p->expire, 3); @@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } #endif } - changes += fold_diff(global_diff); + + for_each_online_pgdat(pgdat) { + struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats; + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_node_stat_diff[i], 0); + if (v) { + atomic_long_add(v, &pgdat->vm_stat[i]); + global_node_diff[i] += v; + } + } + } + + changes += fold_diff(global_zone_diff, global_node_diff); return changes; } @@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets) */ void cpu_vm_stats_fold(int cpu) { + struct pglist_data *pgdat; struct zone *zone; int i; - int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { struct per_cpu_pageset *p; @@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu) v = p->vm_stat_diff[i]; p->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); - global_diff[i] += v; + global_zone_diff[i] += v; } } - fold_diff(global_diff); + for_each_online_pgdat(pgdat) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (p->vm_node_stat_diff[i]) { + int v; + + v = p->vm_node_stat_diff[i]; + p->vm_node_stat_diff[i] = 0; + atomic_long_add(v, &pgdat->vm_stat[i]); + global_node_diff[i] += v; + } + } + + fold_diff(global_zone_diff, global_node_diff); } /* @@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) int v = pset->vm_stat_diff[i]; pset->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); - atomic_long_add(v, &vm_stat[i]); + atomic_long_add(v, &vm_zone_stat[i]); } } #endif #ifdef CONFIG_NUMA /* - * Determine the per node value of a stat item. + * Determine the per node value of a stat item. This function + * is called frequently in a NUMA machine, so try to be as + * frugal as possible. */ -unsigned long node_page_state(int node, enum zone_stat_item item) +unsigned long sum_zone_node_page_state(int node, + enum zone_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; int i; @@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item) return count; } +/* + * Determine the per node value of a stat item. + */ +unsigned long node_page_state(struct pglist_data *pgdat, + enum node_stat_item item) +{ + long x = atomic_long_read(&pgdat->vm_stat[item]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} #endif #ifdef CONFIG_COMPACTION @@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order) const char * const vmstat_text[] = { /* enum zone_stat_item countes */ "nr_free_pages", - "nr_alloc_batch", - "nr_inactive_anon", - "nr_active_anon", - "nr_inactive_file", - "nr_active_file", - "nr_unevictable", + "nr_zone_inactive_anon", + "nr_zone_active_anon", + "nr_zone_inactive_file", + "nr_zone_active_file", + "nr_zone_unevictable", + "nr_zone_write_pending", "nr_mlock", - "nr_anon_pages", - "nr_mapped", - "nr_file_pages", - "nr_dirty", - "nr_writeback", "nr_slab_reclaimable", "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", - "nr_unstable", "nr_bounce", - "nr_vmscan_write", - "nr_vmscan_immediate_reclaim", - "nr_writeback_temp", - "nr_isolated_anon", - "nr_isolated_file", - "nr_shmem", - "nr_dirtied", - "nr_written", - "nr_pages_scanned", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif @@ -729,13 +944,35 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_free_cma", + + /* Node-based counters */ + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", + "nr_unevictable", + "nr_isolated_anon", + "nr_isolated_file", + "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", - "nr_anon_transparent_hugepages", + "nr_anon_pages", + "nr_mapped", + "nr_file_pages", + "nr_dirty", + "nr_writeback", + "nr_writeback_temp", + "nr_shmem", "nr_shmem_hugepages", "nr_shmem_pmdmapped", - "nr_free_cma", + "nr_anon_transparent_hugepages", + "nr_unstable", + "nr_vmscan_write", + "nr_vmscan_immediate_reclaim", + "nr_dirtied", + "nr_written", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -749,6 +986,8 @@ const char * const vmstat_text[] = { "pswpout", TEXTS_FOR_ZONES("pgalloc") + TEXTS_FOR_ZONES("allocstall") + TEXTS_FOR_ZONES("pgskip") "pgfree", "pgactivate", @@ -758,11 +997,11 @@ const char * const vmstat_text[] = { "pgmajfault", "pglazyfreed", - TEXTS_FOR_ZONES("pgrefill") - TEXTS_FOR_ZONES("pgsteal_kswapd") - TEXTS_FOR_ZONES("pgsteal_direct") - TEXTS_FOR_ZONES("pgscan_kswapd") - TEXTS_FOR_ZONES("pgscan_direct") + "pgrefill", + "pgsteal_kswapd", + "pgsteal_direct", + "pgscan_kswapd", + "pgscan_direct", "pgscan_direct_throttle", #ifdef CONFIG_NUMA @@ -774,7 +1013,6 @@ const char * const vmstat_text[] = { "kswapd_low_wmark_hit_quickly", "kswapd_high_wmark_hit_quickly", "pageoutrun", - "allocstall", "pgrotated", @@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; +static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *compare = &pgdat->node_zones[zid]; + + if (populated_zone(compare)) + return zone == compare; + } + + /* The zone must be somewhere! */ + WARN_ON_ONCE(1); + return false; +} + static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { int i; seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + if (is_zone_first_populated(pgdat, zone)) { + seq_printf(m, "\n per-node stats"); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + seq_printf(m, "\n %-12s %lu", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + node_page_state(pgdat, i)); + } + } seq_printf(m, "\n pages free %lu" "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu" + "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone_page_state(zone, NR_PAGES_SCANNED), + node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], + seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); seq_printf(m, @@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, #endif } seq_printf(m, - "\n all_unreclaimable: %u" - "\n start_pfn: %lu" - "\n inactive_ratio: %u", - !zone_reclaimable(zone), + "\n node_unreclaimable: %u" + "\n start_pfn: %lu" + "\n node_inactive_ratio: %u", + !pgdat_reclaimable(zone->zone_pgdat), zone->zone_start_pfn, - zone->inactive_ratio); + zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } @@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS @@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v[i] = global_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + v[i] = global_node_page_state(i); + v += NR_VM_NODE_STAT_ITEMS; + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, v + NR_DIRTY_THRESHOLD); v += NR_VM_WRITEBACK_STAT_ITEMS; @@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg) { unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); return 0; } @@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_stat[i]); + val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { switch (i) { - case NR_ALLOC_BATCH: case NR_PAGES_SCANNED: /* - * These are often seen to go negative in + * This is often seen to go negative in * recent kernels, but not to go permanently * negative. Whilst it would be nicer not to * have exceptions, rooting them out would be diff --git a/mm/workingset.c b/mm/workingset.c index 577277546d98..69551cfae97b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,7 +16,7 @@ /* * Double CLOCK lists * - * Per zone, two clock lists are maintained for file pages: the + * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list @@ -141,11 +141,11 @@ * * Implementation * - * For each zone's file LRU lists, a counter for inactive evictions - * and activations is maintained (zone->inactive_age). + * For each node's file LRU lists, a counter for inactive evictions + * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the zone) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache radix tree * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible @@ -153,7 +153,7 @@ */ #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - ZONES_SHIFT + NODES_SHIFT + \ + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) @@ -167,33 +167,30 @@ */ static unsigned int bucket_order __read_mostly; -static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) { eviction >>= bucket_order; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); - eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, +static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - int memcgid, nid, zid; + int memcgid, nid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; - zid = entry & ((1UL << ZONES_SHIFT) - 1); - entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; - *zonep = NODE_DATA(nid)->node_zones + zid; + *pgdat = NODE_DATA(nid); *evictionp = entry << bucket_order; } @@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, void *workingset_eviction(struct address_space *mapping, struct page *page) { struct mem_cgroup *memcg = page_memcg(page); - struct zone *zone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; @@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_zone_lruvec(zone, memcg); + lruvec = mem_cgroup_lruvec(pgdat, memcg); eviction = atomic_long_inc_return(&lruvec->inactive_age); - return pack_shadow(memcgid, zone, eviction); + return pack_shadow(memcgid, pgdat, eviction); } /** @@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the zone it was allocated in. + * evicted page in the context of the node it was allocated in. * * Returns %true if the page should be activated, %false otherwise. */ @@ -240,10 +237,10 @@ bool workingset_refault(void *shadow) unsigned long eviction; struct lruvec *lruvec; unsigned long refault; - struct zone *zone; + struct pglist_data *pgdat; int memcgid; - unpack_shadow(shadow, &memcgid, &zone, &eviction); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction); rcu_read_lock(); /* @@ -267,7 +264,7 @@ bool workingset_refault(void *shadow) rcu_read_unlock(); return false; } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); + lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); rcu_read_unlock(); @@ -290,10 +287,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_zone_state(zone, WORKINGSET_REFAULT); + inc_node_state(pgdat, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_zone_state(zone, WORKINGSET_ACTIVATE); + inc_node_state(pgdat, WORKINGSET_ACTIVATE); return true; } return false; @@ -305,9 +302,10 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { + struct mem_cgroup *memcg; struct lruvec *lruvec; - lock_page_memcg(page); + rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. @@ -315,12 +313,13 @@ void workingset_activation(struct page *page) * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ - if (!mem_cgroup_disabled() && !page_memcg(page)) + memcg = page_memcg_rcu(page); + if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); atomic_long_inc(&lruvec->inactive_age); out: - unlock_page_memcg(page); + rcu_read_unlock(); } /* @@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - if (memcg_kmem_enabled()) + if (memcg_kmem_enabled()) { pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, LRU_ALL_FILE); - else - pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + - node_page_state(sc->nid, NR_INACTIVE_FILE); + } else { + pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } /* * Active cache pages are limited to 50% of memory, and shadow @@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } } BUG_ON(node->count); - inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); if (!__radix_tree_delete_node(&mapping->page_tree, node)) BUG(); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 04176de6df70..b0bc023d25c5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -20,6 +20,7 @@ * page->freelist(index): links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. + * page->units: first object offset in a subpage of zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -137,9 +138,6 @@ */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) -/* - * We do not maintain any list for completely empty or full pages - */ enum fullness_group { ZS_EMPTY, ZS_ALMOST_EMPTY, @@ -467,11 +465,6 @@ static struct zpool_driver zs_zpool_driver = { MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ -static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) -{ - return pages_per_zspage * PAGE_SIZE / size; -} - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -635,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) freeable = zs_can_compact(class); spin_unlock(&class->lock); - objs_per_zspage = get_maxobj_per_zspage(class->size, - class->pages_per_zspage); + objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; @@ -945,8 +937,8 @@ static void unpin_tag(unsigned long handle) static void reset_page(struct page *page) { __ClearPageMovable(page); - clear_bit(PG_private, &page->flags); - clear_bit(PG_private_2, &page->flags); + ClearPagePrivate(page); + ClearPagePrivate2(page); set_page_private(page, 0); page_mapcount_reset(page); ClearPageHugeObject(page); @@ -1014,8 +1006,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, cache_free_zspage(pool, zspage); - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); + zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } @@ -1350,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void) cpu_notifier_register_done(); } -static void init_zs_size_classes(void) +static void __init init_zs_size_classes(void) { int nr; @@ -1361,16 +1352,14 @@ static void init_zs_size_classes(void) zs_size_classes = nr; } -static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) +static bool can_merge(struct size_class *prev, int pages_per_zspage, + int objs_per_zspage) { - if (prev->pages_per_zspage != pages_per_zspage) - return false; + if (prev->pages_per_zspage == pages_per_zspage && + prev->objs_per_zspage == objs_per_zspage) + return true; - if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) - != get_maxobj_per_zspage(size, pages_per_zspage)) - return false; - - return true; + return false; } static bool zspage_full(struct size_class *class, struct zspage *zspage) @@ -1541,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class, * zs_malloc - Allocate block of given size from pool. * @pool: pool to allocate from * @size: size of block to allocate + * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, * otherwise 0. @@ -1592,8 +1582,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); - zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); + zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @@ -1741,10 +1730,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * return handle. */ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int index) + struct page *page, int *obj_idx) { unsigned long head; int offset = 0; + int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); @@ -1765,6 +1755,9 @@ static unsigned long find_alloced_obj(struct size_class *class, } kunmap_atomic(addr); + + *obj_idx = index; + return handle; } @@ -1776,7 +1769,7 @@ struct zs_compact_control { struct page *d_page; /* Starting object index within @s_page which used for live object * in the subpage. */ - int index; + int obj_idx; }; static int migrate_zspage(struct zs_pool *pool, struct size_class *class, @@ -1786,16 +1779,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, unsigned long handle; struct page *s_page = cc->s_page; struct page *d_page = cc->d_page; - unsigned long index = cc->index; + int obj_idx = cc->obj_idx; int ret = 0; while (1) { - handle = find_alloced_obj(class, s_page, index); + handle = find_alloced_obj(class, s_page, &obj_idx); if (!handle) { s_page = get_next_page(s_page); if (!s_page) break; - index = 0; + obj_idx = 0; continue; } @@ -1809,7 +1802,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, used_obj = handle_to_obj(handle); free_obj = obj_malloc(class, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); - index++; + obj_idx++; /* * record_obj updates handle's value to free_obj and it will * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which @@ -1824,7 +1817,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, /* Remember last position in this iteration */ cc->s_page = s_page; - cc->index = index; + cc->obj_idx = obj_idx; return ret; } @@ -2181,8 +2174,7 @@ static int zs_register_migration(struct zs_pool *pool) static void zs_unregister_migration(struct zs_pool *pool) { flush_work(&pool->free_work); - if (pool->inode) - iput(pool->inode); + iput(pool->inode); } /* @@ -2261,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class) return 0; obj_wasted = obj_allocated - obj_used; - obj_wasted /= get_maxobj_per_zspage(class->size, - class->pages_per_zspage); + obj_wasted /= class->objs_per_zspage; return obj_wasted * class->pages_per_zspage; } @@ -2279,7 +2270,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) if (!zs_can_compact(class)) break; - cc.index = 0; + cc.obj_idx = 0; cc.s_page = get_first_page(src_zspage); while ((dst_zspage = isolate_zspage(class, false))) { @@ -2398,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool) /** * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata + * @name: pool name to be created * * This function must be called before anything when using * the zsmalloc allocator. @@ -2438,6 +2429,7 @@ struct zs_pool *zs_create_pool(const char *name) for (i = zs_size_classes - 1; i >= 0; i--) { int size; int pages_per_zspage; + int objs_per_zspage; struct size_class *class; int fullness = 0; @@ -2445,6 +2437,7 @@ struct zs_pool *zs_create_pool(const char *name) if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; pages_per_zspage = get_pages_per_zspage(size); + objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* * size_class is used for normal zsmalloc operation such @@ -2456,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name) * previous size_class if possible. */ if (prev_class) { - if (can_merge(prev_class, size, pages_per_zspage)) { + if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { pool->size_class[i] = prev_class; continue; } @@ -2469,8 +2462,7 @@ struct zs_pool *zs_create_pool(const char *name) class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; - class->objs_per_zspage = class->pages_per_zspage * - PAGE_SIZE / class->size; + class->objs_per_zspage = objs_per_zspage; spin_lock_init(&class->lock); pool->size_class[i] = class; for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index b1d491c2e704..fdde1bd3e306 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -608,6 +608,7 @@ static const struct { const char *compact; } gfp_compact_table[] = { { "GFP_TRANSHUGE", "THP" }, + { "GFP_TRANSHUGE_LIGHT", "THL" }, { "GFP_HIGHUSER_MOVABLE", "HUM" }, { "GFP_HIGHUSER", "HU" }, { "GFP_USER", "U" }, |