Diffstat (limited to 'mm')
56 files changed, 2820 insertions, 2531 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..e3490ecac839 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK_NODE_MAP - bool - config HAVE_MEMBLOCK_PHYS_MAP bool @@ -705,9 +702,9 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config PGTABLE_MAPPING +config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC + depends on ZSMALLOC=y help By default, zsmalloc uses a copy-based object mapping method to access allocations that span two pages. However, if a particular @@ -750,13 +747,13 @@ config DEFERRED_STRUCT_PAGE_INIT depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT + select PADATA help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up - a subset of memmap at boot and then initialise the rest in parallel - by starting one-off "pgdatinitX" kernel thread for each node X. This - has a potential performance impact on processes running early in the + a subset of memmap at boot and then initialise the rest in parallel. + This has a potential performance impact on tasks running early in the lifetime of the system until these kthreads finish the initialisation. diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 0271b22e063f..2409f7fc1567 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -118,6 +118,38 @@ config DEBUG_RODATA_TEST ---help--- This option enables a testcase for the setting rodata read-only. +config ARCH_HAS_DEBUG_WX + bool + +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on ARCH_HAS_DEBUG_WX + depends on MMU + select PTDUMP_CORE + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving W+X + mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + <arch>/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config GENERIC_PTDUMP bool diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c81b4f3a7268..d382272bcc31 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -15,13 +15,12 @@ #include <trace/events/writeback.h> struct backing_dev_info noop_backing_dev_info = { - .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; -const char *bdi_unknown_name = "(unknown)"; +static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. 
bdi_list has RCU @@ -865,12 +864,11 @@ static int bdi_init(struct backing_dev_info *bdi) return ret; } -struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; - bdi = kmalloc_node(sizeof(struct backing_dev_info), - gfp_mask | __GFP_ZERO, node_id); + bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; @@ -880,7 +878,7 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } return bdi; } -EXPORT_SYMBOL(bdi_alloc_node); +EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { @@ -938,7 +936,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); + vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); + dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -963,7 +962,6 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) trace_writeback_bdi_register(bdi); return 0; } -EXPORT_SYMBOL(bdi_register_va); int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { @@ -977,20 +975,12 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) } EXPORT_SYMBOL(bdi_register); -int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { - int rc; - - rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); - if (rc) - return rc; - /* Leaking owner reference... */ - WARN_ON(bdi->owner); + WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); - return 0; } -EXPORT_SYMBOL(bdi_register_owner); /* * Remove bdi from bdi_list, and ensure that it is no longer visible @@ -1043,6 +1033,14 @@ void bdi_put(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_put); +const char *bdi_dev_name(struct backing_dev_info *bdi) +{ + if (!bdi || !bdi->dev) + return bdi_unknown_name; + return bdi->dev_name; +} +EXPORT_SYMBOL_GPL(bdi_dev_name); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..14d2fe231ea4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1409,7 +1409,9 @@ fast_isolate_freepages(struct compact_control *cc) cc->free_pfn = highest; } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { - page = pfn_to_page(min_pfn); + page = pageblock_pfn_to_page(min_pfn, + pageblock_end_pfn(min_pfn), + cc->zone); cc->free_pfn = min_pfn; } } @@ -1966,7 +1968,7 @@ static enum compact_result compact_finished(struct compact_control *cc) */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx, + int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; @@ -1979,7 +1981,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. 
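A quick aside on the backing-dev hunk above before the compaction changes continue: allocation drops its gfp argument (bdi_alloc() always uses GFP_KERNEL), bdi_register_va() now snapshots the formatted name into bdi->dev_name, bdi_register_owner() is replaced by a plain bdi_set_owner() that no longer registers, and bdi_dev_name() returns a printable name even when no device is attached. A minimal caller sketch under those assumptions; the owner device, dev_t and error policy are illustrative, not part of the patch:

#include <linux/backing-dev.h>
#include <linux/device.h>

/* Sketch only: how a driver might wire up a bdi after this series. */
static int example_setup_bdi(struct device *owner, dev_t devt,
			     struct backing_dev_info **out)
{
	struct backing_dev_info *bdi;
	int ret;

	bdi = bdi_alloc(NUMA_NO_NODE);		/* was bdi_alloc_node(gfp, nid) */
	if (!bdi)
		return -ENOMEM;

	ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
	if (ret) {
		bdi_put(bdi);
		return ret;
	}

	/* was bdi_register_owner(), which registered and set the owner in one go */
	bdi_set_owner(bdi, owner);

	pr_info("using bdi %s\n", bdi_dev_name(bdi));
	*out = bdi;
	return 0;
}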
*/ - if (zone_watermark_ok(zone, order, watermark, classzone_idx, + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, alloc_flags)) return COMPACT_SUCCESS; @@ -1989,9 +1991,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * watermark and alloc_flags have to match, or be more pessimistic than * the check in __isolate_free_page(). We don't use the direct * compactor's alloc_flags, as they are not relevant for freepage - * isolation. We however do use the direct compactor's classzone_idx to - * skip over zones where lowmem reserves would prevent allocation even - * if compaction succeeds. + * isolation. We however do use the direct compactor's highest_zoneidx + * to skip over zones where lowmem reserves would prevent allocation + * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered @@ -2000,7 +2002,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; @@ -2009,12 +2011,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2055,8 +2057,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, * Make sure at least one zone would pass __compaction_suitable if we continue * retrying the reclaim. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; enum compact_result compact_result; @@ -2069,7 +2071,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, - ac_classzone_idx(ac), available); + ac->highest_zoneidx, available); if (compact_result != COMPACT_SKIPPED) return true; } @@ -2098,9 +2100,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + cc->migratetype = gfp_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->classzone_idx); + cc->highest_zoneidx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; @@ -2243,15 +2245,11 @@ check_drain: * would succeed. 
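The classzone_idx -> highest_zoneidx rename that runs through these compaction hunks is purely mechanical. As a small sketch (field values illustrative, not from the patch), a direct compactor now seeds the renamed field straight from the allocation context:

	struct compact_control cc = {
		.order			= order,
		.gfp_mask		= gfp_mask,
		.alloc_flags		= alloc_flags,
		.highest_zoneidx	= ac->highest_zoneidx,	/* was .classzone_idx = ac_classzone_idx(ac) */
		.direct_compaction	= true,
	};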
*/ if (cc->order > 0 && last_migrated_pfn) { - int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); if (last_migrated_pfn < current_block_start) { - cpu = get_cpu(); - lru_add_drain_cpu(cpu); - drain_local_pages(cc->zone); - put_cpu(); + lru_add_drain_cpu_zone(cc->zone); /* No more flushing until we migrate again */ last_migrated_pfn = 0; } @@ -2295,7 +2293,7 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx, + unsigned int alloc_flags, int highest_zoneidx, struct page **capture) { enum compact_result ret; @@ -2307,7 +2305,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, - .classzone_idx = classzone_idx, + .highest_zoneidx = highest_zoneidx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), @@ -2363,8 +2361,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { enum compact_result status; if (prio > MIN_COMPACT_PRIORITY @@ -2374,7 +2372,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac), capture); + alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2463,7 +2461,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); @@ -2509,16 +2507,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; - enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; - for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - classzone_idx) == COMPACT_CONTINUE) + highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2536,16 +2534,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .classzone_idx = pgdat->kcompactd_classzone_idx, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, - cc.classzone_idx); + cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -2594,16 +2592,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) /* * Regardless of success, we are done until woken up next. 
But remember - * the requested order/classzone_idx in case it was higher/tighter than - * our current ones + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones */ if (pgdat->kcompactd_max_order <= cc.order) pgdat->kcompactd_max_order = 0; - if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; } -void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { if (!order) return; @@ -2611,8 +2609,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - if (pgdat->kcompactd_classzone_idx > classzone_idx) - pgdat->kcompactd_classzone_idx = classzone_idx; + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; /* * Pairs with implicit barrier in wait_event_freezable() @@ -2625,7 +2623,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) return; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, - classzone_idx); + highest_zoneidx); wake_up_interruptible(&pgdat->kcompactd_wait); } @@ -2646,7 +2644,7 @@ static int kcompactd(void *p) set_freezable(); pgdat->kcompactd_max_order = 0; - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { unsigned long pflags; diff --git a/mm/debug.c b/mm/debug.c index 2189357f0987..f2ede2df585a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason) else if (PageAnon(page)) type = "anon "; else if (mapping) { - if (mapping->host && mapping->host->i_dentry.first) { - struct dentry *dentry; - dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); - } else - pr_warn("%ps\n", mapping->a_ops); + const struct inode *host; + const struct address_space_operations *a_ops; + const struct hlist_node *dentry_first; + const struct dentry *dentry_ptr; + struct dentry dentry; + + /* + * mapping can be invalid pointer and we don't want to crash + * accessing it, so probe everything depending on it carefully + */ + if (probe_kernel_read_strict(&host, &mapping->host, + sizeof(struct inode *)) || + probe_kernel_read_strict(&a_ops, &mapping->a_ops, + sizeof(struct address_space_operations *))) { + pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); + goto out_mapping; + } + + if (!host) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + if (probe_kernel_read_strict(&dentry_first, + &host->i_dentry.first, sizeof(struct hlist_node *))) { + pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", + a_ops, host); + goto out_mapping; + } + + if (!dentry_first) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (probe_kernel_read_strict(&dentry, dentry_ptr, + sizeof(struct dentry))) { + pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", + a_ops, dentry_ptr); + } else { + /* + * if dentry is corrupted, the %pd handler may still + * crash, but it's unlikely that we reach here with a + * 
corrupted struct page + */ + pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n", + a_ops, &dentry); + } } +out_mapping: BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, diff --git a/mm/fadvise.c b/mm/fadvise.c index 4f17c83db575..0e66f2aaeea3 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -22,6 +22,8 @@ #include <asm/unistd.h> +#include "internal.h" + /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. @@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) if (!nrpages) nrpages = ~0UL; - /* - * Ignore return value because fadvise() shall return - * success even if filesystem can't retrieve a hint, - */ force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: diff --git a/mm/filemap.c b/mm/filemap.c index 23a051a7ef0f..455990621989 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, nr = hpage_nr_pages(page); - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; + mem_cgroup_migrate(old, new); + xas_lock_irqsave(&xas, flags); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. 
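The __dump_page() rework above never dereferences a suspect mapping pointer directly: everything reachable from it is fetched through probe_kernel_read_strict() first, so a corrupted struct page cannot take the dump path down with it. A condensed sketch of that probing pattern, with a hypothetical pointer and messages:

#include <linux/fs.h>
#include <linux/uaccess.h>

/* Sketch only: peek at a pointer that may not be a valid kernel address. */
static void example_probe_mapping(const struct address_space *maybe_bad)
{
	const struct inode *host;

	if (probe_kernel_read_strict(&host, &maybe_bad->host, sizeof(host))) {
		pr_warn("mapping %px is not a readable kernel address\n", maybe_bad);
		return;
	}
	pr_warn("mapping->host:%px\n", host);
}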
*/ if (!PageHuge(old)) - __dec_node_page_state(new, NR_FILE_PAGES); + __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); + __inc_lruvec_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(old)) - __dec_node_page_state(new, NR_SHMEM); + __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); + __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irqrestore(&xas, flags); - mem_cgroup_migrate(old, new); if (freepage) freepage(old); put_page(old); @@ -832,7 +833,6 @@ static int __add_to_page_cache_locked(struct page *page, { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); - struct mem_cgroup *memcg; int error; void *old; @@ -840,17 +840,16 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(PageSwapBacked(page), page); mapping_set_update(&xas, mapping); - if (!huge) { - error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg, false); - if (error) - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; + if (!huge) { + error = mem_cgroup_charge(page, current->mm, gfp_mask); + if (error) + goto error; + } + do { xas_lock_irq(&xas); old = xas_load(&xas); @@ -869,25 +868,23 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); + __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); - if (xas_error(&xas)) + if (xas_error(&xas)) { + error = xas_error(&xas); goto error; + } - if (!huge) - mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return xas_error(&xas); + return error; } ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); @@ -1991,7 +1988,7 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) * * total number of bytes copied, including those the were already @written * * negative error code if nothing was copied */ -static ssize_t generic_file_buffered_read(struct kiocb *iocb, +ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; @@ -2243,6 +2240,7 @@ out: file_accessed(filp); return written ? written : error; } +EXPORT_SYMBOL_GPL(generic_file_buffered_read); /** * generic_file_read_iter - generic filesystem read routine @@ -2566,7 +2564,6 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; @@ -2636,7 +2633,7 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, NULL, page)) + if (alloc_set_pte(vmf, page)) goto unlock; unlock_page(page); goto next; @@ -382,13 +382,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, } /* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pte's, + * but only after we've gone through a COW cycle and they are dirty. 
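The filemap hunks above collapse the old three-step memcg protocol (mem_cgroup_try_charge(), then commit or cancel) into a single mem_cgroup_charge() call made once page->mapping and page->index are set. Condensed into a sketch (the helper name and error policy are illustrative, not the exact upstream code):

#include <linux/memcontrol.h>
#include <linux/pagemap.h>

/* Sketch only: charge a new page-cache page with the one-call API. */
static int example_add_to_cache(struct address_space *mapping,
				struct page *page, pgoff_t index, gfp_t gfp)
{
	int error;

	get_page(page);
	page->mapping = mapping;
	page->index = index;

	error = mem_cgroup_charge(page, current->mm, gfp);	/* replaces try_charge/commit/cancel */
	if (error) {
		page->mapping = NULL;
		put_page(page);
		return error;
	}

	/*
	 * ... then insert into mapping->i_pages under xas_lock_irq() and
	 * account with __inc_lruvec_page_state(page, NR_FILE_PAGES) ...
	 */
	return 0;
}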
*/ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte)); +} + +/* + * A (separate) COW fault might break the page the other way and + * get_user_pages() would return the page from what is now the wrong + * VM. So we need to force a COW break at GUP time even for reads. + */ +static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags) +{ + return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -980,6 +989,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. + * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * @@ -1066,9 +1076,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto out; } if (is_vm_hugetlb_page(vma)) { + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags, locked); + foll_flags, locked); if (locked && *locked == 0) { /* * We've got a VM_FAULT_RETRY @@ -1082,6 +1094,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } } + + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; + retry: /* * If we have a pending SIGKILL, don't keep faulting pages and @@ -1168,7 +1184,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, return true; } -/* +/** * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1176,7 +1192,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller - * does not allow retry + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. 
* * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -1218,6 +1235,10 @@ retry: if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; + if ((fault_flags & FAULT_FLAG_KILLABLE) && + fatal_signal_pending(current)) + return -EINTR; + ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { @@ -1230,11 +1251,9 @@ retry: if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (!(fault_flags & FAULT_FLAG_TRIED)) { - *unlocked = true; - fault_flags |= FAULT_FLAG_TRIED; - goto retry; - } + *unlocked = true; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; } if (tsk) { @@ -1247,6 +1266,10 @@ retry: } EXPORT_SYMBOL_GPL(fixup_user_fault); +/* + * Please note that this function, unlike __get_user_pages will not + * return 0 for nr_pages > 0 without FOLL_NOWAIT + */ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, @@ -1837,7 +1860,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } -/* +/** * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1868,13 +1891,13 @@ static long __get_user_pages_remote(struct task_struct *tsk, * * Must be called with mmap_sem held for read or write. * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different + * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page @@ -1886,17 +1909,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. 
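With the fixup_user_fault() changes earlier in this hunk, VM_FAULT_RETRY is now simply retried (the FAULT_FLAG_TRIED cap is gone) and a fatal signal surfaces as -EINTR when FAULT_FLAG_KILLABLE is passed. A minimal caller sketch; the address and flag mix are illustrative:

#include <linux/mm.h>
#include <linux/sched/signal.h>

/* Sketch only: resolve one user fault, allowing retry and fatal signals. */
static int example_fixup(struct mm_struct *mm, unsigned long addr)
{
	bool unlocked = false;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, addr,
			       FAULT_FLAG_WRITE | FAULT_FLAG_KILLABLE |
			       FAULT_FLAG_ALLOW_RETRY, &unlocked);
	up_read(&mm->mmap_sem);

	return ret;	/* 0 on success, -EFAULT, or -EINTR on a fatal signal */
}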
* - * get_user_pages should be phased out in favor of + * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass + * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -1935,7 +1958,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, } #endif /* !CONFIG_MMU */ -/* +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1958,11 +1991,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * +/** * get_user_pages_locked() is suitable to replace the form: * * down_read(&mm->mmap_sem); @@ -1978,6 +2007,21 @@ EXPORT_SYMBOL(get_user_pages); * get_user_pages_locked(tsk, mm, ..., pages, &locked); * if (locked) * up_read(&mm->mmap_sem); + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -2664,62 +2708,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -/* - * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to - * the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * - * If the architecture does not support this function, simply return with no - * pages pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - unsigned long len, end; - unsigned long flags; - int nr_pinned = 0; - /* - * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, - * because gup fast is always a "pin with a +1 page refcount" request. 
- */ - unsigned int gup_flags = FOLL_GET; - - if (write) - gup_flags |= FOLL_WRITE; - - start = untagged_addr(start) & PAGE_MASK; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) - return 0; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * Disable interrupts. We use the nested form as we can already have - * interrupts disabled by get_futex_key. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. - */ - - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); - local_irq_restore(flags); - } - - return nr_pinned; -} -EXPORT_SYMBOL_GPL(__get_user_pages_fast); - static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { @@ -2748,12 +2736,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, struct page **pages) { unsigned long addr, len, end; + unsigned long flags; int nr_pinned = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE | FOLL_PIN | FOLL_GET))) + FOLL_FORCE | FOLL_PIN | FOLL_GET | + FOLL_FAST_ONLY))) return -EINVAL; + if (!(gup_flags & FOLL_FAST_ONLY)) + might_lock_read(&current->mm->mmap_sem); + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; @@ -2764,15 +2757,42 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_disable(); - gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned); - local_irq_enable(); + /* + * The FAST_GUP case requires FOLL_WRITE even for pure reads, + * because get_user_pages() may need to cause an early COW in + * order to avoid confusing the normal COW routines. So only + * targets that are already writable are safe to do by just + * looking at the page tables. + * + * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here, + * because there is no slow path to fall back on. But you'd + * better be careful about possible COW pages - you'll get _a_ + * COW page, but not necessarily the one you intended to get + * depending on what COW event happens after this. COW may break + * the page copy in a random direction. + * + * Disable interrupts. The nested form is used, in order to allow + * full, general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting.
+ */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { + unsigned long fast_flags = gup_flags; + if (!(gup_flags & FOLL_FAST_ONLY)) + fast_flags |= FOLL_WRITE; + + local_irq_save(flags); + gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); + local_irq_restore(flags); ret = nr_pinned; } - if (nr_pinned < nr_pages) { + if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { /* Try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; @@ -2792,6 +2812,51 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. + * + * Careful, careful! COW breaking can go either way, so a non-write + * access can get ambiguous page results. If you call this function without + * 'write' set, you'd better be sure that you're ok with that ambiguity. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + int nr_pinned; + /* + * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, + * because gup fast is always a "pin with a +1 page refcount" request. + * + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; + + if (write) + gup_flags |= FOLL_WRITE; + + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + + /* + * As specified in the API description above, this routine is not + * allowed to return negative values. However, the common core + * routine internal_get_user_pages_fast() *can* return -errno. + * Therefore, correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -2843,9 +2908,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. + * see Documentation/core-api/pin_user_pages.rst for further details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, @@ -2860,6 +2925,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the + * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * + * The API rules are the same, too: no negative values may be returned. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int nr_pinned; + + /* + * FOLL_GET and FOLL_PIN are mutually exclusive. 
Note that the API + * rules require returning 0, rather than -errno: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return 0; + /* + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + /* + * This routine is not allowed to return negative values. However, + * internal_get_user_pages_fast() *can* return -errno. Therefore, + * correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process (task != current) * @@ -2883,9 +2984,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -2919,9 +3020,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, @@ -2937,3 +3038,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. + */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. 
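pin_user_pages_fast_only() above is the FOLL_PIN counterpart of __get_user_pages_fast(): it never sleeps, never falls back to slow GUP, and never returns a negative value. A minimal caller sketch; the all-or-nothing -EAGAIN policy is illustrative:

#include <linux/mm.h>

/* Sketch only: an atomic-context-friendly pin with no slow-path fallback. */
static int example_pin_fast_only(unsigned long uaddr, struct page **pages,
				 int nr_pages)
{
	int pinned;

	pinned = pin_user_pages_fast_only(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < nr_pages) {		/* never negative by contract */
		unpin_user_pages(pages, pinned);
		return -EAGAIN;
	}
	return 0;
}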
*/ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); @@ -37,28 +37,13 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; -/* - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn - * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the device entry - * Return: valid device entry for the pfn - */ -static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; -} - static int hmm_pfns_fill(unsigned long addr, unsigned long end, - struct hmm_range *range, enum hmm_pfn_value_e value) + struct hmm_range *range, unsigned long cpu_flags) { - uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i = (addr - range->start) >> PAGE_SHIFT; - i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[value]; - + range->hmm_pfns[i] = cpu_flags; return 0; } @@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end, } static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - uint64_t pfns, uint64_t cpu_flags) + unsigned long pfn_req_flags, + unsigned long cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; @@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, * waste to have the user pre-fill the pfn arrays with a default * flags value. */ - pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + pfn_req_flags &= range->pfn_flags_mask; + pfn_req_flags |= range->default_flags; /* We aren't ask to do anything ... */ - if (!(pfns & range->flags[HMM_PFN_VALID])) + if (!(pfn_req_flags & HMM_PFN_REQ_FAULT)) return 0; /* Need to write fault ? */ - if ((pfns & range->flags[HMM_PFN_WRITE]) && - !(cpu_flags & range->flags[HMM_PFN_WRITE])) + if ((pfn_req_flags & HMM_PFN_REQ_WRITE) && + !(cpu_flags & HMM_PFN_WRITE)) return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; /* If CPU page table is not valid then we need to fault */ - if (!(cpu_flags & range->flags[HMM_PFN_VALID])) + if (!(cpu_flags & HMM_PFN_VALID)) return HMM_NEED_FAULT; return 0; } static unsigned int hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - const uint64_t *pfns, unsigned long npages, - uint64_t cpu_flags) + const unsigned long hmm_pfns[], unsigned long npages, + unsigned long cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault = 0; @@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, * hmm_pte_need_fault() will always return 0. 
*/ if (!((range->default_flags | range->pfn_flags_mask) & - range->flags[HMM_PFN_VALID])) + HMM_PFN_REQ_FAULT)) return 0; for (i = 0; i < npages; ++i) { - required_fault |= - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags); + required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i], + cpu_flags); if (required_fault == HMM_NEED_ALL_BITS) return required_fault; } @@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long i, npages; - uint64_t *pfns; + unsigned long *hmm_pfns; i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0); + hmm_pfns = &range->hmm_pfns[i]; + required_fault = + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); if (!walk->vma) { if (required_fault) return -EFAULT; @@ -174,46 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, } if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); - hmm_vma_walk->last = addr; - return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); + return hmm_pfns_fill(addr, end, range, 0); } -static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, + pmd_t pmd) { if (pmd_protnone(pmd)) return 0; - return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, - unsigned long end, uint64_t *pfns, pmd_t pmd) + unsigned long end, unsigned long hmm_pfns[], + pmd_t pmd) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; unsigned int required_fault; - uint64_t cpu_flags; + unsigned long cpu_flags; npages = (end - addr) >> PAGE_SHIFT; cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); required_fault = - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags); + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - hmm_vma_walk->last = end; + hmm_pfns[i] = pfn | cpu_flags; return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ /* stub to allow the code below to compile */ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, - unsigned long end, uint64_t *pfns, pmd_t pmd); + unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool hmm_is_device_private_entry(struct hmm_range *range, @@ -224,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range, range->dev_private_owner; } -static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, + pte_t pte) { if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) return 0; - return pte_write(pte) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pte_write(pte) ? 
(HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long end, pmd_t *pmdp, pte_t *ptep, - uint64_t *pfn) + unsigned long *hmm_pfn) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; - uint64_t cpu_flags; + unsigned long cpu_flags; pte_t pte = *ptep; - uint64_t orig_pfn = *pfn; + uint64_t pfn_req_flags = *hmm_pfn; if (pte_none(pte)) { - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *pfn = range->values[HMM_PFN_NONE]; + *hmm_pfn = 0; return 0; } @@ -260,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * the PFN even if not present. */ if (hmm_is_device_private_entry(range, entry)) { - *pfn = hmm_device_entry_from_pfn(range, - device_private_entry_to_pfn(entry)); - *pfn |= range->flags[HMM_PFN_VALID]; + cpu_flags = HMM_PFN_VALID; if (is_write_device_private_entry(entry)) - *pfn |= range->flags[HMM_PFN_WRITE]; + cpu_flags |= HMM_PFN_WRITE; + *hmm_pfn = device_private_entry_to_pfn(entry) | + cpu_flags; return 0; } - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (!required_fault) { - *pfn = range->values[HMM_PFN_NONE]; + *hmm_pfn = 0; return 0; } @@ -290,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } cpu_flags = pte_to_hmm_pfn_flags(range, pte); - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) goto fault; @@ -299,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * fall through and treat it like a normal page. 
*/ if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { - if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) { + if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; } - *pfn = range->values[HMM_PFN_SPECIAL]; + *hmm_pfn = HMM_PFN_ERROR; return 0; } - *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; + *hmm_pfn = pte_pfn(pte) | cpu_flags; return 0; fault: @@ -323,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long *hmm_pfns = + &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT]; unsigned long npages = (end - start) >> PAGE_SHIFT; unsigned long addr = start; pte_t *ptep; @@ -335,16 +324,16 @@ again: return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) { + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } - return hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + return hmm_pfns_fill(start, end, range, 0); } if (!pmd_present(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } @@ -364,7 +353,7 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd); + return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); } /* @@ -374,37 +363,33 @@ again: * recover. */ if (pmd_bad(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } ptep = pte_offset_map(pmdp, addr); - for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) { + for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { int r; - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns); if (r) { /* hmm_vma_handle_pte() did pte_unmap() */ - hmm_vma_walk->last = addr; return r; } } pte_unmap(ptep - 1); - - hmm_vma_walk->last = addr; return 0; } #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, + pud_t pud) { if (!pud_present(pud)) return 0; - return pud_write(pud) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pud_write(pud) ? 
(HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, @@ -432,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; - uint64_t *pfns, cpu_flags; + unsigned long *hmm_pfns; + unsigned long cpu_flags; if (!pud_present(pud)) { spin_unlock(ptl); @@ -441,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; + hmm_pfns = &range->hmm_pfns[i]; cpu_flags = pud_to_hmm_pfn_flags(range, pud); - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); if (required_fault) { spin_unlock(ptl); @@ -453,9 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | - cpu_flags; - hmm_vma_walk->last = end; + hmm_pfns[i] = pfn | cpu_flags; goto out_unlock; } @@ -479,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - uint64_t orig_pfn, cpu_flags; unsigned int required_fault; + unsigned long pfn_req_flags; + unsigned long cpu_flags; spinlock_t *ptl; pte_t entry; @@ -488,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); i = (start - range->start) >> PAGE_SHIFT; - orig_pfn = range->pfns[i]; + pfn_req_flags = range->hmm_pfns[i]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) { spin_unlock(ptl); return hmm_vma_fault(addr, end, required_fault, walk); @@ -498,9 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | - cpu_flags; - hmm_vma_walk->last = end; + range->hmm_pfns[i] = pfn | cpu_flags; + spin_unlock(ptl); return 0; } @@ -531,13 +516,12 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, * failure. */ if (hmm_range_need_fault(hmm_vma_walk, - range->pfns + + range->hmm_pfns + ((start - range->start) >> PAGE_SHIFT), (end - start) >> PAGE_SHIFT, 0)) return -EFAULT; hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); - hmm_vma_walk->last = end; /* Skip this vma and continue processing the next vma. */ return 1; @@ -555,9 +539,7 @@ static const struct mm_walk_ops hmm_walk_ops = { * hmm_range_fault - try to fault some address in a virtual address range * @range: argument structure * - * Return: the number of valid pages in range->pfns[] (from range start - * address), which may be zero. On error one of the following status codes - * can be returned: + * Returns 0 on success or one of the following error codes: * * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma * (e.g., device file vma). 
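Given the contract documented above (hmm_range_fault() now returns 0 or -errno, with per-page results in range->hmm_pfns[]: the pfn in the low bits plus HMM_PFN_VALID and, where writable, HMM_PFN_WRITE), a driver's fault loop roughly becomes the sketch below; the notifier plumbing and the locking around the final retry check are elided and illustrative:

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>

/* Sketch only: fault a range, retrying when an invalidation races with us. */
static int example_fault_range(struct hmm_range *range)
{
	struct mm_struct *mm = range->notifier->mm;
	int ret;

again:
	range->notifier_seq = mmu_interval_read_begin(range->notifier);
	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range);	/* 0 or -errno, never a page count */
	up_read(&mm->mmap_sem);
	if (ret) {
		if (ret == -EBUSY)
			goto again;
		return ret;
	}

	/*
	 * A real driver takes its own lock here and rechecks
	 * mmu_interval_read_retry(range->notifier, range->notifier_seq)
	 * before consuming range->hmm_pfns[].
	 */
	return 0;
}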
@@ -572,7 +554,7 @@ static const struct mm_walk_ops hmm_walk_ops = { * This is similar to get_user_pages(), except that it can read the page tables * without mutating them (ie causing faults). */ -long hmm_range_fault(struct hmm_range *range) +int hmm_range_fault(struct hmm_range *range) { struct hmm_vma_walk hmm_vma_walk = { .range = range, @@ -590,10 +572,13 @@ long hmm_range_fault(struct hmm_range *range) return -EBUSY; ret = walk_page_range(mm, hmm_vma_walk.last, range->end, &hmm_walk_ops, &hmm_vma_walk); + /* + * When -EBUSY is returned the loop restarts with + * hmm_vma_walk.last set to an address that has not been stored + * in pfns. All entries < last in the pfn array are set to their + * output, and all >= are still at their input values. + */ } while (ret == -EBUSY); - - if (ret) - return ret; - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + return ret; } EXPORT_SYMBOL(hmm_range_fault); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ecd1045113b..e8669885232f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } + cgroup_throttle_swaprate(page, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { @@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); @@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -658,7 +656,6 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return ret; @@ -1255,273 +1252,72 @@ unlock: spin_unlock(vmf->ptl); } -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, - pmd_t orig_pmd, struct page *page) -{ - struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mem_cgroup *memcg; - pgtable_t pgtable; - pmd_t _pmd; - int i; - vm_fault_t ret = 0; - struct page **pages; - struct mmu_notifier_range range; - - pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), - GFP_KERNEL); - if (unlikely(!pages)) { - ret |= VM_FAULT_OOM; - goto out; - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, 
vma, - vmf->address, page_to_nid(page)); - if (unlikely(!pages[i] || - mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, - GFP_KERNEL, &memcg, false))) { - if (pages[i]) - put_page(pages[i]); - while (--i >= 0) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, - false); - put_page(pages[i]); - } - kfree(pages); - ret |= VM_FAULT_OOM; - goto out; - } - set_page_private(pages[i], (unsigned long)memcg); - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - copy_user_highpage(pages[i], page + i, - haddr + PAGE_SIZE * i, vma); - __SetPageUptodate(pages[i]); - cond_resched(); - } - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_free_pages; - VM_BUG_ON_PAGE(!PageHead(page), page); - - /* - * Leave pmd empty until pte is filled note we must notify here as - * concurrent CPU thread might write to new page before the call to - * mmu_notifier_invalidate_range_end() happens which can lead to a - * device seeing memory write in different order than CPU. - * - * See Documentation/vm/mmu_notifier.rst - */ - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); - pmd_populate(vma->vm_mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t entry; - entry = mk_pte(pages[i], vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); - mem_cgroup_commit_charge(pages[i], memcg, false, false); - lru_cache_add_active_or_unevictable(pages[i], vma); - vmf->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*vmf->pte)); - set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); - pte_unmap(vmf->pte); - } - kfree(pages); - - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, vmf->pmd, pgtable); - page_remove_rmap(page, true); - spin_unlock(vmf->ptl); - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. 
- */ - mmu_notifier_invalidate_range_only_end(&range); - - ret |= VM_FAULT_WRITE; - put_page(page); - -out: - return ret; - -out_free_pages: - spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(&range); - for (i = 0; i < HPAGE_PMD_NR; i++) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, false); - put_page(pages[i]); - } - kfree(pages); - goto out; -} - vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *new_page; - struct mem_cgroup *memcg; + struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mmu_notifier_range range; - gfp_t huge_gfp; /* for allocation and charge */ - vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); + if (is_huge_zero_pmd(orig_pmd)) - goto alloc; + goto fallback; + spin_lock(vmf->ptl); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_unlock; + + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + return 0; + } page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. - */ + + /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); lock_page(page); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); unlock_page(page); put_page(page); - goto out_unlock; + return 0; } put_page(page); } + + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. + */ if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - ret |= VM_FAULT_WRITE; unlock_page(page); - goto out_unlock; - } - unlock_page(page); - get_page(page); - spin_unlock(vmf->ptl); -alloc: - if (__transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); - } else - new_page = NULL; - - if (likely(new_page)) { - prep_transhuge_page(new_page); - } else { - if (!page) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } else { - ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); - if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } - put_page(page); - } - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - - if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { - put_page(new_page); - split_huge_pmd(vma, vmf->pmd, vmf->address); - if (page) - put_page(page); - ret |= VM_FAULT_FALLBACK; - count_vm_event(THP_FAULT_FALLBACK); - count_vm_event(THP_FAULT_FALLBACK_CHARGE); - goto out; - } - - count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); - - if (!page) - clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); - else - copy_user_huge_page(new_page, page, vmf->address, - vma, HPAGE_PMD_NR); - __SetPageUptodate(new_page); - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - 
spin_lock(vmf->ptl); - if (page) - put_page(page); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(new_page, memcg, true); - put_page(new_page); - goto out_mn; - } else { - pmd_t entry; - entry = mk_huge_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - page_add_new_anon_rmap(new_page, vma, haddr, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - if (!page) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - } else { - VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page, true); - put_page(page); - } - ret |= VM_FAULT_WRITE; + return VM_FAULT_WRITE; } + + unlock_page(page); spin_unlock(vmf->ptl); -out_mn: - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); -out: - return ret; -out_unlock: - spin_unlock(vmf->ptl); - return ret; +fallback: + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; } /* - * FOLL_FORCE can write to even unwritable pmd's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pmd's, + * but only after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1582,7 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto skip_mlock; if (!trylock_page(page)) goto skip_mlock; - lru_add_drain(); if (page->mapping && !PageDoubleMap(page)) mlock_vma_page(page); unlock_page(page); @@ -2360,15 +2155,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, atomic_inc(&page[i]._mapcount); } + lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) atomic_dec(&page[i]._mapcount); } } + unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); @@ -2386,6 +2183,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; + bool was_locked = false; + pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@ -2398,11 +2197,32 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmd against. Otherwise we can end up replacing wrong page. 
*/ VM_BUG_ON(freeze && !page); - if (page && page != pmd_page(*pmd)) - goto out; + if (page) { + VM_WARN_ON_ONCE(!PageLocked(page)); + was_locked = true; + if (page != pmd_page(*pmd)) + goto out; + } +repeat: if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); + if (!page) { + page = pmd_page(*pmd); + if (unlikely(!trylock_page(page))) { + get_page(page); + _pmd = *pmd; + spin_unlock(ptl); + lock_page(page); + spin_lock(ptl); + if (unlikely(!pmd_same(*pmd, _pmd))) { + unlock_page(page); + put_page(page); + page = NULL; + goto repeat; + } + put_page(page); + } + } if (PageMlocked(page)) clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) @@ -2410,6 +2230,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); + if (!was_locked && page) + unlock_page(page); /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): @@ -2784,7 +2606,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; - bool mlocked; unsigned long flags; pgoff_t end; @@ -2843,14 +2664,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - mlocked = PageMlocked(head); unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); - /* Make sure the page is not on per-CPU pagevec as it takes pin */ - if (mlocked) - lru_add_drain(); - /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irqsave(&pgdata->lru_lock, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bcabbe02192b..ac0d7bbc0692 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -59,8 +59,8 @@ __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; -static unsigned long __initdata default_hstate_size; static bool __initdata parsed_valid_hugepagesz = true; +static bool __initdata parsed_default_hugepagesz; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -3060,7 +3060,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("Hugetlb: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s", h->name); } } @@ -3164,7 +3164,7 @@ static void hugetlb_register_node(struct node *node) nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { - pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; @@ -3212,23 +3212,41 @@ static int __init hugetlb_init(void) { int i; - if (!hugepages_supported()) + if (!hugepages_supported()) { + if (hugetlb_max_hstate || default_hstate_max_huge_pages) + pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; + } - if (!size_to_hstate(default_hstate_size)) { - if (default_hstate_size != 0) { - pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", - default_hstate_size, HPAGE_SIZE); + /* + * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some + * architectures depend on setup being done here. 
+ */ + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + if (!parsed_default_hugepagesz) { + /* + * If we did not parse a default huge page size, set + * default_hstate_idx to HPAGE_SIZE hstate. And, if the + * number of huge pages for this default size was implicitly + * specified, set that here as well. + * Note that the implicit setting will overwrite an explicit + * setting. A warning will be printed in this case. + */ + default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); + if (default_hstate_max_huge_pages) { + if (default_hstate.max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(&default_hstate), + 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", + default_hstate.max_huge_pages, buf); + pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", + default_hstate_max_huge_pages); + } + default_hstate.max_huge_pages = + default_hstate_max_huge_pages; } - - default_hstate_size = HPAGE_SIZE; - if (!size_to_hstate(default_hstate_size)) - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - } - default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); - if (default_hstate_max_huge_pages) { - if (!default_hstate.max_huge_pages) - default_hstate.max_huge_pages = default_hstate_max_huge_pages; } hugetlb_cma_check(); @@ -3256,10 +3274,10 @@ static int __init hugetlb_init(void) } subsys_initcall(hugetlb_init); -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_bad_size(void) +/* Overwritten by architectures with more huge page sizes */ +bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { - parsed_valid_hugepagesz = false; + return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) @@ -3268,7 +3286,6 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -3289,20 +3306,29 @@ void __init hugetlb_add_hstate(unsigned int order) parsed_hstate = h; } -static int __init hugetlb_nrpages_setup(char *s) +/* + * hugepages command line processing + * hugepages normally follows a valid hugepagsz or default_hugepagsz + * specification. If not, ignore the hugepages value. hugepages can also + * be the first huge page command line option in which case it implicitly + * specifies the number of huge pages for the default size. + */ +static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; if (!parsed_valid_hugepagesz) { - pr_warn("hugepages = %s preceded by " - "an unsupported hugepagesz, ignoring\n", s); + pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 1; + return 0; } + /* - * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter + * yet, so this hugepages= parameter goes to the "default hstate". + * Otherwise, it goes with the previously parsed hugepagesz or + * default_hugepagesz. 
*/ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; @@ -3310,8 +3336,8 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); - return 1; + pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); + return 0; } if (sscanf(s, "%lu", mhp) <= 0) @@ -3329,14 +3355,102 @@ static int __init hugetlb_nrpages_setup(char *s) return 1; } -__setup("hugepages=", hugetlb_nrpages_setup); +__setup("hugepages=", hugepages_setup); -static int __init hugetlb_default_setup(char *s) +/* + * hugepagesz command line processing + * A specific huge page size can only be specified once with hugepagesz. + * hugepagesz is followed by hugepages on the command line. The global + * variable 'parsed_valid_hugepagesz' is used to determine if prior + * hugepagesz argument was valid. + */ +static int __init hugepagesz_setup(char *s) { - default_hstate_size = memparse(s, &s); + unsigned long size; + struct hstate *h; + + parsed_valid_hugepagesz = false; + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); + return 0; + } + + h = size_to_hstate(size); + if (h) { + /* + * hstate for this size already exists. This is normally + * an error, but is allowed if the existing hstate is the + * default hstate. More specifically, it is only allowed if + * the number of huge pages for the default hstate was not + * previously specified. + */ + if (!parsed_default_hugepagesz || h != &default_hstate || + default_hstate.max_huge_pages) { + pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); + return 0; + } + + /* + * No need to call hugetlb_add_hstate() as hstate already + * exists. But, do set parsed_hstate so that a following + * hugepages= parameter will be applied to this hstate. + */ + parsed_hstate = h; + parsed_valid_hugepagesz = true; + return 1; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; return 1; } -__setup("default_hugepagesz=", hugetlb_default_setup); +__setup("hugepagesz=", hugepagesz_setup); + +/* + * default_hugepagesz command line input + * Only one instance of default_hugepagesz allowed on command line. + */ +static int __init default_hugepagesz_setup(char *s) +{ + unsigned long size; + + parsed_valid_hugepagesz = false; + if (parsed_default_hugepagesz) { + pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); + return 0; + } + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); + return 0; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + parsed_default_hugepagesz = true; + default_hstate_idx = hstate_index(size_to_hstate(size)); + + /* + * The number of default huge pages (for this size) could have been + * specified as the first hugetlb parameter: hugepages=X. If so, + * then default_hstate_max_huge_pages is set. If the default huge + * page size is gigantic (>= MAX_ORDER), then the pages must be + * allocated here from bootmem allocator. 
+ */ + if (default_hstate_max_huge_pages) { + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + if (hstate_is_gigantic(&default_hstate)) + hugetlb_hstate_alloc_pages(&default_hstate); + default_hstate_max_huge_pages = 0; + } + + return 1; +} +__setup("default_hugepagesz=", default_hugepagesz_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { @@ -3352,7 +3466,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3489,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3498,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3506,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; @@ -5355,8 +5468,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * - * Return: Pointer to page table or swap entry (PUD or PMD) for - * address @addr, or NULL if a p*d_none() entry is encountered and the + * Return: Pointer to page table entry (PUD or PMD) for + * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ @@ -5365,8 +5478,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, { pgd_t *pgd; p4d_t *p4d; - pud_t *pud, pud_entry; - pmd_t *pmd, pmd_entry; + pud_t *pud; + pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -5376,22 +5489,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(p4d, addr); - pud_entry = READ_ONCE(*pud); - if (sz != PUD_SIZE && pud_none(pud_entry)) - return NULL; - /* hugepage or swap? */ - if (pud_huge(pud_entry) || !pud_present(pud_entry)) + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ return (pte_t *)pud; - - pmd = pmd_offset(pud, addr); - pmd_entry = READ_ONCE(*pmd); - if (sz != PMD_SIZE && pmd_none(pmd_entry)) + if (!pud_present(*pud)) return NULL; - /* hugepage or swap? 
*/ - if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry)) - return (pte_t *)pmd; + /* must have a valid entry and size to go further */ - return NULL; + pmd = pmd_offset(pud, addr); + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..9117bca90f4b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -extern unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void force_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read); +void __do_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size); /* * Submit IO for the read-ahead request in file_ra_state. */ -static inline unsigned long ra_submit(struct file_ra_state *ra, +static inline void ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { - return __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); } /** @@ -125,10 +127,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * between functions involved in allocations, including the alloc_pages* * family of functions. * - * nodemask, migratetype and high_zoneidx are initialized only once in + * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * - * zonelist, preferred_zone and classzone_idx are set first in + * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole strucure * by a const pointer. @@ -138,12 +140,21 @@ struct alloc_context { nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; - enum zone_type high_zoneidx; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. + */ + enum zone_type highest_zoneidx; bool spread_dirty_pages; }; -#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) - /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). 
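The alloc_context comment above can be made concrete with a small sketch (not the kernel's actual check, which lives in __zone_watermark_ok()): lower zones are protected by lowmem_reserve[] indexed with the request's highest usable zone, so a request whose highest_zoneidx is ZONE_NORMAL must leave that reserve untouched when it falls back to, say, ZONE_DMA32. Order and alloc_flags handling are deliberately omitted here.

#include <linux/mmzone.h>
#include <linux/vmstat.h>

/*
 * Simplified sketch of how highest_zoneidx selects the lowmem reserve
 * applied on top of the ordinary watermark.
 */
static bool zone_has_room(struct zone *z, unsigned long mark,
			  enum zone_type highest_zoneidx)
{
	unsigned long free = zone_page_state(z, NR_FREE_PAGES);

	/* The lower the zone relative to the request, the larger its reserve. */
	return free > mark + z->lowmem_reserve[highest_zoneidx];
}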
@@ -222,7 +233,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ @@ -527,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 08b43de2383b..d532c2587731 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,23 +1,33 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n -UBSAN_SANITIZE_common.o := n -UBSAN_SANITIZE_generic.o := n -UBSAN_SANITIZE_generic_report.o := n -UBSAN_SANITIZE_tags.o := n +UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Disable ftrace to avoid recursion. CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack) +CC_FLAGS_KASAN_RUNTIME += $(call cc-option, -fno-stack-protector) +# Disable branch tracing to avoid recursion. 
+CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING -CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o init.o report.o obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2906358e42f0..757d4074fe28 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -33,7 +33,6 @@ #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> -#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm) } #endif -extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -extern bool report_enabled(void); - -bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) -{ - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } - - user_access_restore(flags); - - return ret; -} - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 56ff8885fe2e..098a7dbaced6 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -15,7 +15,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define DISABLE_BRANCH_PROFILING #include <linux/export.h> #include <linux/interrupt.h> diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index e8f37199d885..cfade6413528 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -212,8 +212,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); void __asan_register_globals(struct kasan_global *globals, size_t size); void __asan_unregister_globals(struct kasan_global *globals, size_t size); -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); void __asan_handle_no_return(void); void __asan_alloca_poison(unsigned long addr, size_t size); void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); @@ -228,6 +226,8 @@ void __asan_load8(unsigned long addr); void __asan_store8(unsigned long addr); void __asan_load16(unsigned long addr); void __asan_store16(unsigned long addr); +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); void __asan_load1_noabort(unsigned long addr); void __asan_store1_noabort(unsigned long addr); @@ -239,6 +239,21 @@ void __asan_load8_noabort(unsigned long addr); void __asan_store8_noabort(unsigned long addr); void __asan_load16_noabort(unsigned long addr); void __asan_store16_noabort(unsigned long addr); +void __asan_loadN_noabort(unsigned long addr, size_t size); +void __asan_storeN_noabort(unsigned long addr, size_t size); + +void 
__asan_report_load1_noabort(unsigned long addr); +void __asan_report_store1_noabort(unsigned long addr); +void __asan_report_load2_noabort(unsigned long addr); +void __asan_report_store2_noabort(unsigned long addr); +void __asan_report_load4_noabort(unsigned long addr); +void __asan_report_store4_noabort(unsigned long addr); +void __asan_report_load8_noabort(unsigned long addr); +void __asan_report_store8_noabort(unsigned long addr); +void __asan_report_load16_noabort(unsigned long addr); +void __asan_report_store16_noabort(unsigned long addr); +void __asan_report_load_n_noabort(unsigned long addr, size_t size); +void __asan_report_store_n_noabort(unsigned long addr, size_t size); void __asan_set_shadow_00(const void *addr, size_t size); void __asan_set_shadow_f1(const void *addr, size_t size); @@ -247,4 +262,19 @@ void __asan_set_shadow_f3(const void *addr, size_t size); void __asan_set_shadow_f5(const void *addr, size_t size); void __asan_set_shadow_f8(const void *addr, size_t size); +void __hwasan_load1_noabort(unsigned long addr); +void __hwasan_store1_noabort(unsigned long addr); +void __hwasan_load2_noabort(unsigned long addr); +void __hwasan_store2_noabort(unsigned long addr); +void __hwasan_load4_noabort(unsigned long addr); +void __hwasan_store4_noabort(unsigned long addr); +void __hwasan_load8_noabort(unsigned long addr); +void __hwasan_store8_noabort(unsigned long addr); +void __hwasan_load16_noabort(unsigned long addr); +void __hwasan_store16_noabort(unsigned long addr); +void __hwasan_loadN_noabort(unsigned long addr, size_t size); +void __hwasan_storeN_noabort(unsigned long addr, size_t size); + +void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); + #endif diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 80f23c9da6b0..51ec45407a0b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -29,6 +29,7 @@ #include <linux/kasan.h> #include <linux/module.h> #include <linux/sched/task_stack.h> +#include <linux/uaccess.h> #include <asm/sections.h> @@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr) } } -bool report_enabled(void) +static bool report_enabled(void) { if (current->kasan_depth) return false; @@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +static void __kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) { struct kasan_access_info info; void *tagged_addr; @@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon end_report(&flags); } +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + unsigned long flags = user_access_save(); + bool ret = false; + + if (likely(report_enabled())) { + __kasan_report(addr, size, is_write, ip); + ret = true; + } + + user_access_restore(flags); + + return ret; +} + #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 25b7734e7013..8a959fdd30e3 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -12,7 +12,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define DISABLE_BRANCH_PROFILING #include <linux/export.h> #include <linux/interrupt.h> diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 99d77ffb79c2..3f032487825b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,8 @@ enum scan_result { SCAN_SUCCEED, SCAN_PMD_NULL, 
SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, @@ -47,7 +49,6 @@ enum scan_result { SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, }; @@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, khugepaged_max_ptes_swap_store); +static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, + khugepaged_max_ptes_shared_store); + static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, - &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -359,6 +389,7 @@ int __init khugepaged_init(void) khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } @@ -512,27 +543,52 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + -compound_nr(page)); unlock_page(page); putback_lru_page(page); } -static void release_pte_pages(pte_t *pte, pte_t *_pte) +static void release_pte_pages(pte_t *pte, pte_t *_pte, + struct list_head *compound_pagelist) { + struct page *page, *tmp; + while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); + + page = pte_page(pteval); + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && + !PageCompound(page)) + release_pte_page(page); + } + + list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { + list_del(&page->lru); + release_pte_page(page); } } +static bool is_refcount_suitable(struct page *page) +{ + int expected_refcount; + + expected_refcount = total_mapcount(page); + if (PageSwapCache(page)) + expected_refcount += compound_nr(page); + + return page_count(page) == expected_refcount; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) + pte_t *pte, + struct list_head *compound_pagelist) { struct page *page = NULL; pte_t 
*_pte; - int none_or_zero = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; @@ -558,13 +614,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + VM_BUG_ON_PAGE(!PageAnon(page), page); + + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out; } - VM_BUG_ON_PAGE(!PageAnon(page), page); + if (PageCompound(page)) { + struct page *p; + page = compound_head(page); + + /* + * Check if we have dealt with the compound page + * already + */ + list_for_each_entry(p, compound_pagelist, lru) { + if (page == p) + goto next; + } + } /* * We can do it before isolate_lru_page because the @@ -578,28 +648,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * The page table that maps the page has been already unlinked + * from the page table tree and this process cannot get + * an additinal pin on the page. + * + * New pins can come later if the page is shared across fork, + * but not from this process. The other process cannot write to + * the page, only trigger CoW. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; } - if (pte_write(pteval)) { - writable = true; - } else { - if (PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } + if (!pte_write(pteval) && PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { /* - * Page is not in the swap cache. It can be collapsed - * into a THP. + * Page is in the swap cache and cannot be re-used. + * It cannot be collapsed into a THP. 
*/ + unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; + goto out; } /* @@ -611,16 +683,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + compound_nr(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + if (PageCompound(page)) + list_add_tail(&page->lru, compound_pagelist); +next: /* There should be enough young pte to collapse the page */ if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (pte_write(pteval)) + writable = true; } if (likely(writable)) { if (likely(referenced)) { @@ -634,7 +713,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } out: - release_pte_pages(pte, _pte); + release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 0; @@ -643,13 +722,14 @@ out: static void __collapse_huge_page_copy(pte_t *pte, struct page *page, struct vm_area_struct *vma, unsigned long address, - spinlock_t *ptl) + spinlock_t *ptl, + struct list_head *compound_pagelist) { + struct page *src_page, *tmp; pte_t *_pte; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; - struct page *src_page; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); @@ -669,8 +749,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); + if (!PageCompound(src_page)) + release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -687,6 +767,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, free_page_and_swap_cache(src_page); } } + + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + release_pte_page(src_page); + } } static void khugepaged_alloc_sleep(void) @@ -899,11 +984,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pgoff = linear_page_index(vma, address), }; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } vmf.pte = pte_offset_map(pmd, address); for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; vmf.pte++, vmf.address += PAGE_SIZE) { @@ -936,6 +1016,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, } vmf.pte--; pte_unmap(vmf.pte); + + /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ + if (swapped_in) + lru_add_drain(); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -943,15 +1028,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced) + int node, int referenced, int unmapped) { + LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; - struct mem_cgroup *memcg; struct 
vm_area_struct *vma; struct mmu_notifier_range range; gfp_t gfp; @@ -974,15 +1059,15 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); down_read(&mm->mmap_sem); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { - mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); goto out_nolock; } @@ -990,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd = mm_find_pmd(mm, address); if (!pmd) { result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); goto out_nolock; } @@ -1000,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm, * If it fails, we release mmap_sem and jump out_nolock. * Continuing to collapse causes inconsistency. */ - if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg, true); + if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, + pmd, referenced)) { up_read(&mm->mmap_sem); goto out_nolock; } @@ -1044,7 +1128,8 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); + isolated = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist); spin_unlock(pte_ptl); if (unlikely(!isolated)) { @@ -1069,7 +1154,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -1087,8 +1173,6 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1102,10 +1186,11 @@ static void collapse_huge_page(struct mm_struct *mm, out_up_write: up_write(&mm->mmap_sem); out_nolock: + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -1116,7 +1201,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + int ret = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; spinlock_t *ptl; @@ -1188,12 +1274,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out_unmap; } + page = compound_head(page); + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. 
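The new max_ptes_shared limit sits next to the existing max_ptes_none and max_ptes_swap knobs under /sys/kernel/mm/transparent_hugepage/khugepaged/ and defaults to HPAGE_PMD_NR/2 (256 with 2 MiB THPs on x86-64). A small, self-contained userspace check of the knob might look like the following; the path mirrors the sysfs attribute registered above, but treat it as an assumption if your layout differs.

#include <stdio.h>

#define MAX_PTES_SHARED \
	"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared"

int main(void)
{
	unsigned int val;
	FILE *f = fopen(MAX_PTES_SHARED, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%u", &val) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	/* Writing 0 to this file forbids collapsing any PTE shared across fork(). */
	printf("khugepaged tolerates up to %u shared PTEs per PMD\n", val);
	return 0;
}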
@@ -1220,11 +1308,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * Here the check is racy it may see totmal_mapcount > refcount + * in some cases. + * For example, one process with one forked child process. + * The parent has the PMD split due to MADV_DONTNEED, then + * the child is trying unmap the whole PMD, but khugepaged + * may be scanning the parent between the child has + * PageDoubleMap flag cleared and dec the mapcount. So + * khugepaged may see total_mapcount > refcount. + * + * But such case is ephemeral we could always retry collapse + * later. However it may report false positive if the page + * has excessive GUP pins (i.e. 512). Anyway the same check + * will be done again later the risk seems low. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1233,22 +1333,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, mmu_notifier_test_young(vma->vm_mm, address)) referenced++; } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_LACK_REFERENCED_PAGE; - } - } else { + if (!writable) { result = SCAN_PAGE_RO; + } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, node, referenced); + collapse_huge_page(mm, address, hpage, node, + referenced, unmapped); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1515,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm, struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; - struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1534,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* This will be less messy when we use multi-index entries */ do { @@ -1547,7 +1646,6 @@ static void collapse_file(struct mm_struct *mm, break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { - mem_cgroup_cancel_charge(new_page, memcg, true); result = SCAN_FAIL; goto out; } @@ -1692,6 +1790,7 @@ static void collapse_file(struct mm_struct *mm, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; + putback_lru_page(page); goto out_unlock; } @@ -1740,12 +1839,9 @@ out_unlock: } if (nr_none) { - struct zone *zone = page_zone(new_page); - - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_node_page_state(zone->zone_pgdat, - NR_SHMEM, nr_none); + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } xa_locked: @@ -1783,15 +1879,9 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - 
mem_cgroup_commit_charge(new_page, memcg, false, true); - - if (is_shmem) { + if (is_shmem) set_page_dirty(new_page); - lru_cache_add_anon(new_page); - } else { - lru_cache_add_file(new_page); - } - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); + lru_cache_add(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1838,13 +1928,14 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - mem_cgroup_cancel_charge(new_page, memcg, true); new_page->mapping = NULL; } unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } @@ -2083,6 +2174,8 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ + lru_add_drain_all(); + while (progress < pages) { if (!khugepaged_prealloc_page(&hpage, &wait)) break; diff --git a/mm/madvise.c b/mm/madvise.c index 4bb30ed6c8d2..8cbd8c1bfe15 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -27,6 +27,7 @@ #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> +#include <linux/sched/mm.h> #include <asm/tlb.h> @@ -1090,6 +1091,23 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) if (write) { if (down_write_killable(¤t->mm->mmap_sem)) return -EINTR; + + /* + * We may have stolen the mm from another process + * that is undergoing core dumping. + * + * Right now that's io_ring, in the future it may + * be remote process management and not "current" + * at all. + * + * We need to fix core dumping to not do this, + * but for now we have the mmget_still_valid() + * model. + */ + if (!mmget_still_valid(current->mm)) { + up_write(¤t->mm->mmap_sem); + return -EINTR; + } } else { down_read(¤t->mm->mmap_sem); } diff --git a/mm/memblock.c b/mm/memblock.c index c79ba6f9920c..743659d88fc4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -620,7 +620,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, *idx = ULLONG_MAX; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * Common iterator interface used to define for_each_mem_pfn_range(). 
*/ @@ -1207,13 +1206,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, { struct memblock_type *type = &memblock.memory; struct memblock_region *r; + int r_nid; while (++*idx < type->cnt) { r = &type->regions[*idx]; + r_nid = memblock_get_region_node(r); if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r->nid) + if (nid == MAX_NUMNODES || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1226,7 +1227,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (out_end_pfn) *out_end_pfn = PFN_DOWN(r->base + r->size); if (out_nid) - *out_nid = r->nid; + *out_nid = r_nid; } /** @@ -1245,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { +#ifdef CONFIG_NEED_MULTIPLE_NODES int start_rgn, end_rgn; int i, ret; @@ -1256,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); +#endif return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() @@ -1797,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr) return !memblock_is_nomap(&memblock.memory.regions[i]); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { @@ -1810,9 +1812,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); - return type->regions[mid].nid; + return memblock_get_region_node(&type->regions[mid]); } -#endif /** * memblock_is_region_memory - check if a region is a subset of memory @@ -1903,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5beea03dd58a..5381afb23d58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,9 +83,9 @@ static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -int do_swap_account __read_mostly; +bool cgroup_memory_noswap __read_mostly; #else -#define do_swap_account 0 +#define cgroup_memory_noswap 1 #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -95,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } #define THRESHOLDS_EVENTS_TARGET 128 @@ -834,25 +834,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool compound, int nr_pages) + int nr_pages) { - /* - * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is - * counted as CACHE even if it's on ANON LRU. 
- */ - if (PageAnon(page)) - __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); - else { - __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); - if (PageSwapBacked(page)) - __mod_memcg_state(memcg, NR_SHMEM, nr_pages); - } - - if (compound) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); - } - /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __count_memcg_events(memcg, PGPGIN, 1); @@ -1218,9 +1201,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, * @page: the page * @pgdat: pgdat of the page * - * This function is only safe when following the LRU page isolation - * and putback protocol: the LRU lock must be held, and the page must - * either be PageLRU() or the caller must have isolated/allocated it. + * This function relies on page->mem_cgroup being stable - see the + * access rules in commit_charge(). */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { @@ -1314,7 +1296,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.max); - if (count <= limit) + if (count < limit) margin = min(margin, limit - count); else margin = 0; @@ -1389,10 +1371,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg) */ seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * + (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * PAGE_SIZE); seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * + (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * @@ -1418,15 +1400,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. 
- */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_ANON_THPS) * + HPAGE_PMD_SIZE); +#endif for (i = 0; i < NR_LRU_LISTS; i++) seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), @@ -1451,6 +1429,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_buf_printf(&s, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, WORKINGSET_RESTORE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); @@ -1979,6 +1959,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) */ struct mem_cgroup *lock_page_memcg(struct page *page) { + struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -1998,7 +1979,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = page->mem_cgroup; + memcg = head->mem_cgroup; if (unlikely(!memcg)) return NULL; @@ -2006,7 +1987,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page->mem_cgroup) { + if (memcg != head->mem_cgroup) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2049,7 +2030,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) */ void unlock_page_memcg(struct page *page) { - __unlock_page_memcg(page->mem_cgroup); + struct page *head = compound_head(page); + + __unlock_page_memcg(head->mem_cgroup); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2250,7 +2233,8 @@ static void reclaim_high(struct mem_cgroup *memcg, gfp_t gfp_mask) { do { - if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high)) + if (page_counter_read(&memcg->memory) <= + READ_ONCE(memcg->memory.high)) continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); @@ -2319,41 +2303,64 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14 -/* - * Get the number of jiffies that we should penalise a mischievous cgroup which - * is exceeding its memory.high by checking both it and its ancestors. 
- */ -static unsigned long calculate_high_delay(struct mem_cgroup *memcg, - unsigned int nr_pages) +static u64 calculate_overage(unsigned long usage, unsigned long high) { - unsigned long penalty_jiffies; - u64 max_overage = 0; + u64 overage; - do { - unsigned long usage, high; - u64 overage; + if (usage <= high) + return 0; - usage = page_counter_read(&memcg->memory); - high = READ_ONCE(memcg->high); + /* + * Prevent division by 0 in overage calculation by acting as if + * it was a threshold of 1 page + */ + high = max(high, 1UL); - if (usage <= high) - continue; + overage = usage - high; + overage <<= MEMCG_DELAY_PRECISION_SHIFT; + return div64_u64(overage, high); +} - /* - * Prevent division by 0 in overage calculation by acting as if - * it was a threshold of 1 page - */ - high = max(high, 1UL); +static u64 mem_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->memory), + READ_ONCE(memcg->memory.high)); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} - overage = usage - high; - overage <<= MEMCG_DELAY_PRECISION_SHIFT; - overage = div64_u64(overage, high); +static u64 swap_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; - if (overage > max_overage) - max_overage = overage; + do { + overage = calculate_overage(page_counter_read(&memcg->swap), + READ_ONCE(memcg->swap.high)); + if (overage) + memcg_memory_event(memcg, MEMCG_SWAP_HIGH); + max_overage = max(overage, max_overage); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); + return max_overage; +} + +/* + * Get the number of jiffies that we should penalise a mischievous cgroup which + * is exceeding its memory.high by checking both it and its ancestors. + */ +static unsigned long calculate_high_delay(struct mem_cgroup *memcg, + unsigned int nr_pages, + u64 max_overage) +{ + unsigned long penalty_jiffies; + if (!max_overage) return 0; @@ -2377,14 +2384,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or * larger the current charge patch is than that. */ - penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; - - /* - * Clamp the max delay per usermode return so as to still keep the - * application moving forwards and also permit diagnostics, albeit - * extremely slowly. - */ - return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; } /* @@ -2409,7 +2409,18 @@ void mem_cgroup_handle_over_high(void) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. */ - penalty_jiffies = calculate_high_delay(memcg, nr_pages); + penalty_jiffies = calculate_high_delay(memcg, nr_pages, + mem_find_max_overage(memcg)); + + penalty_jiffies += calculate_high_delay(memcg, nr_pages, + swap_find_max_overage(memcg)); + + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); /* * Don't sleep if the amount of jiffies this memcg owes us is so low @@ -2594,12 +2605,32 @@ done_restock: * reclaim, the cost of mismatch is negligible. 
*/ do { - if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) { - /* Don't bother a random interrupted task */ - if (in_interrupt()) { + bool mem_high, swap_high; + + mem_high = page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high); + swap_high = page_counter_read(&memcg->swap) > + READ_ONCE(memcg->swap.high); + + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + if (mem_high) { schedule_work(&memcg->high_work); break; } + continue; + } + + if (mem_high || swap_high) { + /* + * The allocating tasks in this cgroup will need to do + * reclaim or be throttled to prevent further growth + * of the memory or swap footprints. + * + * Target some best-effort fairness between the tasks, + * and distribute reclaim work and delay penalties + * based on how much each task is actually allocating. + */ current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -2609,6 +2640,7 @@ done_restock: return 0; } +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) @@ -2620,70 +2652,20 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } +#endif -static void lock_page_lru(struct page *page, int *isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - *isolated = 1; - } else - *isolated = 0; -} - -static void unlock_page_lru(struct page *page, int isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - if (isolated) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&pgdat->lru_lock); -} - -static void commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) +static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - int isolated; - VM_BUG_ON_PAGE(page->mem_cgroup, page); - /* - * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page - * may already be on some other mem_cgroup's LRU. Take care of it. - */ - if (lrucare) - lock_page_lru(page, &isolated); - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point: - * - * - the page is uncharged + * Any of the following ensures page->mem_cgroup stability: * - * - the page is off-LRU - * - * - an anonymous fault has exclusive page access, except for - * a locked page table - * - * - a page cache insertion, a swapin fault, or a migration - * have the page locked + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference */ page->mem_cgroup = memcg; - - if (lrucare) - unlock_page_lru(page, isolated); } #ifdef CONFIG_MEMCG_KMEM @@ -2802,7 +2784,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, static inline bool memcg_kmem_bypass(void) { - if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + if (in_interrupt()) + return true; + + /* Allow remote memcg charging in kthread contexts. 
*/ + if ((!current->mm || (current->flags & PF_KTHREAD)) && + !current->active_memcg) return true; return false; } @@ -3015,8 +3002,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - - __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3299,8 +3284,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - val = memcg_page_state(memcg, MEMCG_CACHE) + - memcg_page_state(memcg, MEMCG_RSS); + val = memcg_page_state(memcg, NR_FILE_PAGES) + + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) val += memcg_page_state(memcg, MEMCG_SWAP); } else { @@ -3688,7 +3673,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) + int nid, unsigned int lru_mask, bool tree) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; @@ -3699,13 +3684,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); + if (tree) + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + else + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); } return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) + unsigned int lru_mask, + bool tree) { unsigned long nr = 0; enum lru_list lru; @@ -3713,7 +3702,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); + if (tree) + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + else + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); } return nr; } @@ -3733,34 +3725,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) }; const struct numa_stat *stat; int nid; - unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); - seq_printf(m, "%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + seq_printf(m, "%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + false)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, false)); seq_putc(m, '\n'); } for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - struct mem_cgroup *iter; - - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); - seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_node_nr_lru_pages( - iter, nid, stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + + seq_printf(m, "hierarchical_%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + true)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, true)); seq_putc(m, '\n'); } @@ -3769,9 
+3755,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, + NR_FILE_PAGES, + NR_ANON_MAPPED, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + NR_ANON_THPS, +#endif NR_SHMEM, NR_FILE_MAPPED, NR_FILE_DIRTY, @@ -3782,7 +3770,9 @@ static const unsigned int memcg1_stats[] = { static const char *const memcg1_stat_names[] = { "cache", "rss", +#ifdef CONFIG_TRANSPARENT_HUGEPAGE "rss_huge", +#endif "shmem", "mapped_file", "dirty", @@ -3808,11 +3798,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state_local(memcg, memcg1_stats[i]) * - PAGE_SIZE); + nr = memcg_page_state_local(memcg, memcg1_stats[i]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memcg1_stats[i] == NR_ANON_THPS) + nr *= HPAGE_PMD_NR; +#endif + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -3858,23 +3853,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) { pg_data_t *pgdat; struct mem_cgroup_per_node *mz; - struct zone_reclaim_stat *rstat; - unsigned long recent_rotated[2] = {0, 0}; - unsigned long recent_scanned[2] = {0, 0}; + unsigned long anon_cost = 0; + unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; + anon_cost += mz->lruvec.anon_cost; + file_cost += mz->lruvec.file_cost; } - seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); - seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); - seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); - seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); + seq_printf(m, "anon_cost %lu\n", anon_cost); + seq_printf(m, "file_cost %lu\n", file_cost); } #endif @@ -4330,7 +4319,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); - /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); @@ -4338,7 +4326,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4990,19 +4978,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void) unsigned int size; int node; int __maybe_unused i; + long error = -ENOMEM; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return NULL; + return ERR_PTR(error); memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, GFP_KERNEL); - if (memcg->id.id < 0) + if (memcg->id.id < 0) { + error = memcg->id.id; goto 
fail; + } memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); if (!memcg->vmstats_local) @@ -5046,7 +5037,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) fail: mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); - return NULL; + return ERR_PTR(error); } static struct cgroup_subsys_state * __ref @@ -5057,11 +5048,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) long error = -ENOMEM; memcg = mem_cgroup_alloc(); - if (!memcg) - return ERR_PTR(error); + if (IS_ERR(memcg)) + return ERR_CAST(memcg); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; @@ -5108,7 +5100,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) fail: mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); } static int mem_cgroup_css_online(struct cgroup_subsys_state *css) @@ -5213,8 +5205,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); } @@ -5305,8 +5298,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); - if (do_memsw_account()) - entry->val = ent.val; + entry->val = ent.val; return page; } @@ -5340,8 +5332,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_memsw_account()) - *entry = swp; + *entry = swp; page = find_get_page(swap_address_space(swp), swp_offset(swp)); } @@ -5372,10 +5363,8 @@ static int mem_cgroup_move_account(struct page *page, { struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned long flags; unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; - bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5393,30 +5382,47 @@ static int mem_cgroup_move_account(struct page *page, if (page->mem_cgroup != from) goto out_unlock; - anon = PageAnon(page); - pgdat = page_pgdat(page); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - spin_lock_irqsave(&from->move_lock, flags); + lock_page_memcg(page); - if (!anon && page_mapped(page)) { - __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); - } + if (PageAnon(page)) { + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + if (PageTransHuge(page)) { + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); + } - /* - * move_lock grabbed above and caller set from->moving_account, so - * mod_memcg_page_state will serialize updates to PageDirty. - * So mapping should be stable for dirty pages. 
- */ - if (!anon && PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + } + } else { + __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); - if (mapping_cap_account_dirty(mapping)) { - __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages); + if (PageSwapBacked(page)) { + __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); + __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); + } + + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); + } + + if (PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __mod_lruvec_state(from_vec, NR_FILE_DIRTY, + -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_DIRTY, + nr_pages); + } } } @@ -5426,22 +5432,30 @@ static int mem_cgroup_move_account(struct page *page, } /* + * All state has been migrated, let's switch to the new memcg. + * * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. + * is referenced, charged, isolated, and locked: we can't race + * with (un)charging, migration, LRU putback, or anything else + * that would rely on a stable page->mem_cgroup. + * + * Note that lock_page_memcg is a memcg lock, not a page lock, + * to save space. As soon as we switch page->mem_cgroup to a + * new memcg that isn't locked, the above state can change + * concurrently again. Make sure we're truly done with it. */ + smp_mb(); - /* caller should have done css_get */ - page->mem_cgroup = to; + page->mem_cgroup = to; /* caller should have done css_get */ - spin_unlock_irqrestore(&from->move_lock, flags); + __unlock_page_memcg(from); ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, compound, nr_pages); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, compound, -nr_pages); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -6012,7 +6026,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -6029,7 +6044,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - WRITE_ONCE(memcg->high, high); + page_counter_set_high(&memcg->memory, high); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6224,7 +6239,6 @@ static struct cftype memory_files[] = { }, { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, { @@ -6427,125 +6441,63 @@ out: } /** - * mem_cgroup_try_charge - try charging a page + * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode - * @memcgp: charged memcg return - * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * - * Returns 0 on success, with *@memcgp pointing to the charged memcg. 
- * Otherwise, an error code is returned. - * - * After page->mapping has been set up, the caller must finalize the - * charge with mem_cgroup_commit_charge(). Or abort the transaction - * with mem_cgroup_cancel_charge() in case page instantiation fails. + * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) goto out; if (PageSwapCache(page)) { + swp_entry_t ent = { .val = page_private(page), }; + unsigned short id; + /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which + * already charged pages, too. page->mem_cgroup is protected + * by the page lock, which serializes swap cache removal, which * in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound_head(page)->mem_cgroup) goto out; - if (do_swap_account) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id = lookup_swap_cgroup_id(ent); - - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } + id = lookup_swap_cgroup_id(ent); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg && !css_tryget_online(&memcg->css)) + memcg = NULL; + rcu_read_unlock(); } if (!memcg) memcg = get_mem_cgroup_from_mm(mm); ret = try_charge(memcg, gfp_mask, nr_pages); + if (ret) + goto out_put; - css_put(&memcg->css); -out: - *memcgp = memcg; - return ret; -} - -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) -{ - struct mem_cgroup *memcg; - int ret; - - ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); - memcg = *memcgp; - mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); - return ret; -} - -/** - * mem_cgroup_commit_charge - commit a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @lrucare: page might be on LRU already - * @compound: charge the page as compound or small page - * - * Finalize a charge transaction started by mem_cgroup_try_charge(), - * after page->mapping has been set up. This must happen atomically - * as part of the page instantiation, i.e. under the page table lock - * for anonymous pages, under the page lock for page and swap cache. - * - * In addition, the page must not be on the LRU during the commit, to - * prevent racing with task migration. If it might be, use @lrucare. - * - * Use mem_cgroup_cancel_charge() to cancel the transaction instead. - */ -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - VM_BUG_ON_PAGE(!page->mapping, page); - VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). 
- */ - if (!memcg) - return; - - commit_charge(page, memcg, lrucare); + commit_charge(page, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); + mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_memsw_account() && PageSwapCache(page)) { + if (PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -6554,42 +6506,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, */ mem_cgroup_uncharge_swap(entry, nr_pages); } -} -/** - * mem_cgroup_cancel_charge - cancel a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @compound: charge the page as compound or small page - * - * Cancel a charge transaction started by mem_cgroup_try_charge(). - */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - cancel_charge(memcg, nr_pages); +out_put: + css_put(&memcg->css); +out: + return ret; } struct uncharge_gather { struct mem_cgroup *memcg; + unsigned long nr_pages; unsigned long pgpgout; - unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_kmem; - unsigned long nr_huge; - unsigned long nr_shmem; struct page *dummy_page; }; @@ -6600,37 +6528,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, nr_pages); + page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); - __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); - __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); - __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); if (!mem_cgroup_is_root(ug->memcg)) - css_put_many(&ug->memcg->css, nr_pages); + css_put_many(&ug->memcg->css, ug->nr_pages); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) { + unsigned long nr_pages; + VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && - !PageHWPoison(page) , page); if (!page->mem_cgroup) return; @@ -6649,23 +6572,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->memcg = page->mem_cgroup; } - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; + nr_pages = compound_nr(page); + ug->nr_pages += nr_pages; - if (PageTransHuge(page)) { - nr_pages = 
compound_nr(page); - ug->nr_huge += nr_pages; - } - if (PageAnon(page)) - ug->nr_anon += nr_pages; - else { - ug->nr_file += nr_pages; - if (PageSwapBacked(page)) - ug->nr_shmem += nr_pages; - } + if (!PageKmemcg(page)) { ug->pgpgout++; } else { - ug->nr_kmem += compound_nr(page); + ug->nr_kmem += nr_pages; __ClearPageKmemcg(page); } @@ -6702,8 +6615,7 @@ static void uncharge_list(struct list_head *page_list) * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_try_charge() and - * mem_cgroup_commit_charge(). + * Uncharge a page previously charged with mem_cgroup_charge(). */ void mem_cgroup_uncharge(struct page *page) { @@ -6726,7 +6638,7 @@ void mem_cgroup_uncharge(struct page *page) * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + * mem_cgroup_charge(). */ void mem_cgroup_uncharge_list(struct list_head *page_list) { @@ -6779,11 +6691,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, false); + commit_charge(newpage, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage), - nr_pages); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); } @@ -6971,7 +6882,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_memsw_account()) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; memcg = page->mem_cgroup; @@ -7000,7 +6911,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg != swap_memcg) { + if (!cgroup_memory_noswap && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7013,8 +6924,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. 
*/ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), - -nr_entries); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) @@ -7037,7 +6947,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; memcg = page->mem_cgroup; @@ -7053,7 +6963,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!mem_cgroup_is_root(memcg) && + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7081,14 +6991,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) { + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7104,7 +7011,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7121,37 +7028,33 @@ bool mem_cgroup_swap_full(struct page *page) if (vm_swap_full()) return true; - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = page->mem_cgroup; if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= - READ_ONCE(memcg->swap.max)) + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) return true; + } return false; } -/* for remember boot option*/ -#ifdef CONFIG_MEMCG_SWAP_ENABLED -static int really_do_swap_account __initdata = 1; -#else -static int really_do_swap_account __initdata; -#endif - -static int __init enable_swap_account(char *s) +static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - really_do_swap_account = 1; + cgroup_memory_noswap = 0; else if (!strcmp(s, "0")) - really_do_swap_account = 0; + cgroup_memory_noswap = 1; return 1; } -__setup("swapaccount=", enable_swap_account); +__setup("swapaccount=", setup_swap_account); static u64 swap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -7161,6 +7064,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = 
mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + static int swap_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7188,6 +7114,8 @@ static int swap_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); seq_printf(m, "fail %lu\n", @@ -7203,6 +7131,12 @@ static struct cftype swap_files[] = { .read_u64 = swap_current_read, }, { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_high_show, + .write = swap_high_write, + }, + { .name = "swap.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = swap_max_show, @@ -7217,7 +7151,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -static struct cftype memsw_cgroup_files[] = { +static struct cftype memsw_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), @@ -7246,13 +7180,16 @@ static struct cftype memsw_cgroup_files[] = { static int __init mem_cgroup_swap_init(void) { - if (!mem_cgroup_disabled() && really_do_swap_account) { - do_swap_account = 1; - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, - swap_files)); - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, - memsw_cgroup_files)); - } + /* No memory control -> no swap control */ + if (mem_cgroup_disabled()) + cgroup_memory_noswap = true; + + if (cgroup_memory_noswap) + return 0; + + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); + return 0; } subsys_initcall(mem_cgroup_swap_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a96364be8ab4..ababa368cb68 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -210,14 +210,17 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) { struct task_struct *t = tk->tsk; short addr_lsb = tk->size_shift; - int ret; + int ret = 0; - pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + if ((t->mm == current->mm) || !(flags & MF_ACTION_REQUIRED)) + pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); - if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { - ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, - addr_lsb); + if (flags & MF_ACTION_REQUIRED) { + if (t->mm == current->mm) + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); + /* send no signal to non-current processes */ } else { /* * Don't use force here, it's convenient if the signal @@ -1493,7 +1496,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = this_cpu_ptr(&memory_failure_cpu); + mf_cpu = container_of(work, struct memory_failure_cpu, work); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); @@ -1507,6 +1510,19 @@ static void memory_failure_work_func(struct work_struct *work) } } +/* + * Process memory_failure work queued on the specified CPU. + * Used to avoid return-to-userspace racing with the memory_failure workqueue. 
+ */ +void memory_failure_queue_kick(int cpu) +{ + struct memory_failure_cpu *mf_cpu; + + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + cancel_work_sync(&mf_cpu->work); + memory_failure_work_func(&mf_cpu->work); +} + static int __init memory_failure_init(void) { struct memory_failure_cpu *mf_cpu; diff --git a/mm/memory.c b/mm/memory.c index f703fe8c8346..7b70398f76a0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -802,8 +802,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; - } else if (pte_devmap(pte)) { - page = pte_page(pte); } out_set_pte: @@ -2436,10 +2434,9 @@ static inline bool cow_user_page(struct page *dst, struct page *src, if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* * Other thread has already handled the fault - * and we don't need to do anything. If it's - * not the case, the fault will be triggered - * again on the same address. + * and update local tlb only */ + update_mmu_tlb(vma, addr, vmf->pte); ret = false; goto pte_unlock; } @@ -2463,7 +2460,8 @@ static inline bool cow_user_page(struct page *dst, struct page *src, vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { - /* The PTE changed under us. Retry page fault. */ + /* The PTE changed under us, update local tlb */ + update_mmu_tlb(vma, addr, vmf->pte); ret = false; goto pte_unlock; } @@ -2647,7 +2645,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - struct mem_cgroup *memcg; struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) @@ -2678,8 +2675,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); @@ -2704,6 +2702,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* * Clear the pte entry and flush it first, before updating the @@ -2713,7 +2712,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2752,7 +2750,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg, false); + update_mmu_tlb(vma, vmf->address, vmf->pte); } if (new_page) @@ -2812,6 +2810,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) * pte_offset_map_lock. 
*/ if (!pte_same(*vmf->pte, vmf->orig_pte)) { + update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } @@ -2936,6 +2935,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_same(*vmf->pte, vmf->orig_pte)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); unlock_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(vmf->page); @@ -3090,7 +3090,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; - struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; @@ -3131,10 +3130,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { + int err; + __SetPageLocked(page); __SetPageSwapBacked(page); set_page_private(page, entry.val); - lru_cache_add_anon(page); + + /* Tell memcg to use swap ownership records */ + SetPageSwapCache(page); + err = mem_cgroup_charge(page, vma->vm_mm, + GFP_KERNEL); + ClearPageSwapCache(page); + if (err) + goto out_page; + + lru_cache_add(page); swap_readpage(page, true); } } else { @@ -3195,11 +3205,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = VM_FAULT_OOM; - goto out_page; - } + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. @@ -3247,11 +3253,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } @@ -3287,7 +3291,6 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); @@ -3308,7 +3311,6 @@ out_release: static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; struct page *page; vm_fault_t ret = 0; pte_t entry; @@ -3341,8 +3343,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) + if (!pte_none(*vmf->pte)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; + } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; @@ -3361,9 +3365,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, - false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * The memory barrier inside __SetPageUptodate makes sure that @@ -3373,13 +3377,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) + if (!pte_none(*vmf->pte)) { + update_mmu_cache(vma, 
vmf->address, vmf->pte); goto release; + } ret = check_stable_address_space(vma->vm_mm); if (ret) @@ -3388,14 +3395,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3406,7 +3411,6 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); goto unlock; oom_free_page: @@ -3611,7 +3615,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * mapping. If needed, the fucntion allocates page table or use pre-allocated. * * @vmf: fault environment - * @memcg: memcg to charge page (only for private mappings) * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on @@ -3622,8 +3625,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Return: %0 on success, %VM_FAULT_ code in case of error. */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page) +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3631,9 +3633,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - /* THP on COW? 
*/ - VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; @@ -3646,18 +3645,20 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, } /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) + if (unlikely(!pte_none(*vmf->pte))) { + update_mmu_tlb(vma, vmf->address, vmf->pte); return VM_FAULT_NOPAGE; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); @@ -3706,7 +3707,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) - ret = alloc_set_pte(vmf, vmf->memcg, page); + ret = alloc_set_pte(vmf, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3866,11 +3867,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &vmf->memcg, false)) { + if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } + cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL); ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3888,7 +3889,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } @@ -4224,8 +4224,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) + if (unlikely(!pte_same(*vmf->pte, entry))) { + update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; + } if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(vmf); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..926ec704e835 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -879,13 +879,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } else { int cpu; /* - * Reset the nr_zones, order and classzone_idx before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. 
*/ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; @@ -1372,11 +1372,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 48ba9729062e..1965e2681877 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -927,10 +927,7 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err == 0) { - /* E.g. GUP interrupted by fatal signal */ - err = -EFAULT; - } else if (err > 0) { + if (err > 0) { err = page_to_nid(p); put_page(p); } diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..7bfd0962149e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping, * are mapped to swap space. */ if (newzone != oldzone) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); + struct lruvec *old_lruvec, *new_lruvec; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); + + __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); + __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); + __dec_lruvec_state(old_lruvec, NR_SHMEM); + __inc_lruvec_state(new_lruvec, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); @@ -797,11 +804,7 @@ recheck_buffers: if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); + attach_page_private(newpage, detach_page_private(page)); bh = head; do { @@ -810,8 +813,6 @@ recheck_buffers: } while (bh != head); - SetPagePrivate(newpage); - if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else @@ -1032,7 +1033,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. 
Rather than * trying to be clever about what pages can be locked, @@ -2739,7 +2740,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; - struct mem_cgroup *memcg; bool flush = false; spinlock_t *ptl; pte_t entry; @@ -2786,7 +2786,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto abort; /* @@ -2832,7 +2832,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); get_page(page); @@ -2854,7 +2853,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, unlock_abort: pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); abort: *src &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 7da6991d9435..435e5f794b3b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_CPUPID_SHIFT); + LAST_CPUPID_SHIFT, + KASAN_TAG_WIDTH); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_CPUPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT, + (unsigned long)KASAN_TAG_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), diff --git a/mm/mremap.c b/mm/mremap.c index c881abeba0bf..6aa6ea605068 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -794,7 +794,7 @@ out: if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); - mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + mremap_userfaultfd_complete(&uf, addr, ret, old_len); userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 318df4e236c9..dfae55f41901 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -150,16 +150,25 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } 
EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); +} + +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) +{ + return __vmalloc(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; @@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_user_flags(size, flags | __GFP_ZERO); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); @@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +304,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +316,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); @@ -351,7 +353,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; @@ -369,18 +371,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_[un]mapping() if the architecture - * chose not to have one. 
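The nommu conversions above are one side of a tree-wide interface change: __vmalloc() and vm_map_ram() no longer take a pgprot_t, and such mappings are always created PAGE_KERNEL, with callers that genuinely need a different protection going through __vmalloc_node_range() instead. A hedged before/after sketch of a typical caller; example_alloc_table() is illustrative:

static void *example_alloc_table(size_t size)
{
	/* old: return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); */
	return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
}

The same pattern shows up later in this series in pcpu_mem_zalloc() and alloc_large_system_hash().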
- */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfc357614e56..4daedf7b91f6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; @@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, - high_zoneidx, oc->nodemask) + highest_zoneidx, oc->nodemask) if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d79ed1f88c7a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void) * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_task(tsk)) dirty += dirty / 4; return dirty; @@ -505,15 +504,13 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +521,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +531,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +546,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; 
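The handler conversions above (and the matching ones further down in page_alloc.c) follow the sysctl core change that now hands handlers a kernel-space buffer rather than a user pointer, so the __user annotation and the copy helpers disappear from the signatures. A minimal sketch of a handler in the new form; example_sysctl_handler and its use of proc_dointvec_minmax() are illustrative:

static int example_sysctl_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* @buffer already lives in kernel space; no memdup_user_nul() dance */
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		pr_debug("sysctl value updated\n");
	return ret;
}

The numa_zonelist_order_handler() hunk later in page_alloc.c shows the practical payoff: the string can be parsed straight out of @buffer.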
@@ -759,7 +753,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1567,7 +1561,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; @@ -1587,14 +1581,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); @@ -1653,8 +1640,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv = dirty_poll_interval(dirty, thresh); - unsigned long m_intv = ULONG_MAX; + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; @@ -1673,9 +1664,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(gdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); @@ -1689,9 +1691,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(mdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. + */ + goto free_running; + } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); @@ -1938,8 +1951,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. 
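The new free_running label lets PF_LOCAL_THROTTLE tasks (the replacement for PF_LESS_THROTTLE used by callers such as the loop driver and nfsd) skip the pause loop as long as their own writeback device stays below its per-wb freerun ceiling. Assuming the long-standing definition of dirty_freerun_ceiling() as the midpoint of the two thresholds, the check reduces to simple arithmetic:

static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	return (thresh + bg_thresh) / 2;
}

/*
 * Worked example with assumed numbers: wb_thresh = 200 pages and
 * wb_bg_thresh = 100 pages give a ceiling of 150 pages, so a
 * PF_LOCAL_THROTTLE task seeing wb_dirty = 120 jumps to free_running
 * instead of being paused, while one seeing wb_dirty = 180 is throttled
 * as before.
 */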
*/ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) @@ -1972,7 +1984,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; @@ -2164,7 +2176,6 @@ int write_cache_pages(struct address_space *mapping, int error; struct pagevec pvec; int nr_pages; - pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -2173,8 +2184,7 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec); if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; + index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69827d4fa052..07ae77d97952 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include <linux/lockdep.h> #include <linux/nmi.h> #include <linux/psi.h> +#include <linux/padata.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -302,14 +303,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; @@ -335,7 +336,6 @@ static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; @@ -348,7 +348,6 @@ static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -609,8 +608,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, const char *reason, - unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason) { static unsigned long resume; static unsigned long nr_shown; @@ -639,10 +637,6 @@ static void bad_page(struct page *page, const char *reason, pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); __dump_page(page, reason); - bad_flags &= page->flags; - if (bad_flags) - pr_alert("bad because of flags: %#lx(%pGp)\n", - bad_flags, &bad_flags); dump_page_owner(page); print_modules(); @@ -1077,13 +1071,9 @@ static inline bool page_expected_state(struct page *page, return true; } -static void free_pages_check_bad(struct page 
*page) +static const char *page_bad_reason(struct page *page, unsigned long flags) { - const char *bad_reason; - unsigned long bad_flags; - - bad_reason = NULL; - bad_flags = 0; + const char *bad_reason = NULL; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -1091,24 +1081,32 @@ static void free_pages_check_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + return bad_reason; } -static inline int free_pages_check(struct page *page) +static void check_free_page_bad(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); +} + +static inline int check_free_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; /* Something has gone sideways, find it */ - free_pages_check_bad(page); + check_free_page_bad(page); return 1; } @@ -1130,7 +1128,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 1: /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); + bad_page(page, "nonzero compound_mapcount"); goto out; } break; @@ -1142,17 +1140,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) break; default: if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); + bad_page(page, "corrupted mapping in tail page"); goto out; } break; } if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); + bad_page(page, "PageTail not set"); goto out; } if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); + bad_page(page, "compound_head not consistent"); goto out; } ret = 0; @@ -1194,7 +1192,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { + if (unlikely(check_free_page(page + i))) { bad++; continue; } @@ -1206,7 +1204,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += free_pages_check(page); + bad += check_free_page(page); if (bad) return false; @@ -1253,7 +1251,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_check(page); + return check_free_page(page); else return false; } @@ -1274,7 +1272,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - return free_pages_check(page); + return check_free_page(page); } #endif /* CONFIG_DEBUG_VM */ @@ -1499,45 +1497,49 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages(page, order); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - 
defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; -int __meminit early_pfn_to_nid(unsigned long pfn) +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { - static DEFINE_SPINLOCK(early_pfn_lock); + unsigned long start_pfn, end_pfn; int nid; - spin_lock(&early_pfn_lock); - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid < 0) - nid = first_online_node; - spin_unlock(&early_pfn_lock); + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } return nid; } -#endif +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0 && nid != node) - return false; - return true; -} + if (nid < 0) + nid = first_online_node; + spin_unlock(&early_pfn_lock); -#else -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return true; + return nid; } -#endif - +#endif /* CONFIG_NEED_MULTIPLE_NODES */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -1607,6 +1609,7 @@ void set_zone_contiguous(struct zone *zone) if (!__pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone)) return; + cond_resched(); } /* We confirm that there is no hole */ @@ -1691,7 +1694,6 @@ static void __init deferred_free_pages(unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - touch_nmi_watchdog(); } else { nr_free++; } @@ -1721,7 +1723,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - touch_nmi_watchdog(); } else { page++; } @@ -1815,16 +1816,43 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, return nr_pages; } +static void __init +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + void *arg) +{ + unsigned long spfn, epfn; + struct zone *zone = arg; + u64 i; + + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); + + /* + * Initialize and free pages in MAX_ORDER sized increments so that we + * can avoid introducing any issues with the buddy allocator. + */ + while (spfn < end_pfn) { + deferred_init_maxorder(&i, zone, &spfn, &epfn); + cond_resched(); + } +} + +/* An arch may override for more concurrency. 
*/ +__weak int __init +deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + return 1; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long spfn = 0, epfn = 0; unsigned long first_init_pfn, flags; unsigned long start = jiffies; struct zone *zone; - int zid; + int zid, max_threads; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1844,6 +1872,13 @@ static int __init deferred_init_memmap(void *data) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); pgdat->first_deferred_pfn = ULONG_MAX; + /* + * Once we unlock here, the zone cannot be grown anymore, thus if an + * interrupt thread must allocate this early in boot, zone must be + * pre-grown prior to start of deferred page initialization. + */ + pgdat_resize_unlock(pgdat, &flags); + /* Only the highest zone is deferred so find it */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { zone = pgdat->node_zones + zid; @@ -1856,21 +1891,30 @@ static int __init deferred_init_memmap(void *data) first_init_pfn)) goto zone_empty; - /* - * Initialize and free pages in MAX_ORDER sized increments so - * that we can avoid introducing any issues with the buddy - * allocator. - */ - while (spfn < epfn) - nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); -zone_empty: - pgdat_resize_unlock(pgdat, &flags); + max_threads = deferred_page_init_max_threads(cpumask); + while (spfn < epfn) { + unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); + struct padata_mt_job job = { + .thread_fn = deferred_init_memmap_chunk, + .fn_arg = zone, + .start = spfn, + .size = epfn_align - spfn, + .align = PAGES_PER_SECTION, + .min_chunk = PAGES_PER_SECTION, + .max_threads = max_threads, + }; + + padata_do_multithreaded(&job); + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + epfn_align); + } +zone_empty: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", - pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); + pr_info("node %d deferred pages initialised in %ums\n", + pgdat->node_id, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1908,17 +1952,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order) pgdat_resize_lock(pgdat, &flags); /* - * If deferred pages have been initialized while we were waiting for - * the lock, return true, as the zone was grown. The caller will retry - * this zone. We won't return to this function since the caller also - * has this static branch. - */ - if (!static_branch_unlikely(&deferred_pages)) { - pgdat_resize_unlock(pgdat, &flags); - return true; - } - - /* * If someone grew this zone while we were waiting for spinlock, return * true, as there might be enough pages already. 
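deferred_init_memmap() above now carves the node's remaining memory into section-aligned padata_mt_job chunks and lets the architecture decide how many threads may work on one node through the weak deferred_page_init_max_threads() hook, which defaults to a single thread. A hedged sketch of an override; x86 uses roughly this form, sizing the job to the CPUs local to the node being initialised:

int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	/* one thread per local CPU, but never fewer than one */
	return max_t(unsigned int, cpumask_weight(node_cpumask), 1);
}

Because the chunks are PAGES_PER_SECTION aligned, each worker initialises and frees a disjoint PFN range in MAX_ORDER sized steps, mirroring what the single-threaded path did before.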
*/ @@ -1946,6 +1979,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) first_deferred_pfn = spfn; nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); /* We should only stop along section boundaries */ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) @@ -2091,31 +2125,14 @@ static inline void expand(struct zone *zone, struct page *page, static void check_new_page_bad(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; - - if (unlikely(atomic_read(&page->_mapcount) != -1)) - bad_reason = "nonzero mapcount"; - if (unlikely(page->mapping != NULL)) - bad_reason = "non-NULL mapping"; - if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { - bad_reason = "HWPoisoned (hardware-corrupted)"; - bad_flags = __PG_HWPOISON; /* Don't complain about hwpoisoned pages */ page_mapcount_reset(page); /* remove PageBuddy */ return; } - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; - } -#ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) - bad_reason = "page still charged to cgroup"; -#endif - bad_page(page, bad_reason, bad_flags); + + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); } /* @@ -2400,6 +2417,14 @@ static inline void boost_watermark(struct zone *zone) if (!watermark_boost_factor) return; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return; max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); @@ -2600,7 +2625,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, int order; bool ret; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -2759,6 +2784,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; +#ifdef CONFIG_CMA + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (migratetype == MIGRATE_MOVABLE && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + return page; + } +#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -3455,7 +3494,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3500,7 +3539,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. 
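The __rmqueue() hunk above adds a simple balancing heuristic: movable allocations are steered into CMA pageblocks once more than half of the zone's free memory sits in the CMA area, instead of leaving CMA as a last-resort fallback. Restated as a predicate, with an illustrative figure:

static bool example_prefer_cma(struct zone *zone)
{
	/*
	 * e.g. 600 MiB of free CMA out of 1024 MiB of total free memory
	 * in the zone: 600 > 512, so MOVABLE requests draw from CMA first.
	 */
	return zone_page_state(zone, NR_FREE_CMA_PAGES) >
	       zone_page_state(zone, NR_FREE_PAGES) / 2;
}

This keeps the non-CMA part of the zone available for unmovable allocations that could never use the CMA pageblocks.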
*/ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3533,14 +3572,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); long cma_pages = 0; @@ -3558,22 +3598,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > + mark + z->lowmem_reserve[highest_zoneidx]) return true; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } @@ -3650,8 +3691,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -3706,7 +3747,7 @@ retry: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3739,7 +3780,7 @@ retry: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3898,7 +3939,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -4101,10 +4142,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. 
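zone_watermark_fast() keeps its order-0 shortcut, now indexed by highest_zoneidx: if the free pages minus any CMA pages clear the watermark plus the lowmem reserve, the full __zone_watermark_ok() walk over the per-order free lists is skipped. A restatement with assumed numbers:

static bool example_order0_fast_ok(struct zone *z, unsigned int highest_zoneidx,
				   long free_pages, long cma_pages,
				   unsigned long mark)
{
	/*
	 * e.g. free_pages = 10240, cma_pages = 0, mark = 1024 and
	 * lowmem_reserve[highest_zoneidx] = 4096: 10240 > 5120, so an
	 * order-0 request proceeds without inspecting the free lists.
	 */
	return (free_pages - cma_pages) >
	       mark + z->lowmem_reserve[highest_zoneidx];
}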
*/ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -4228,12 +4269,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4276,7 +4317,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4368,8 +4409,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4383,7 +4424,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4502,7 +4543,7 @@ retry_cpuset: * could end up iterating over non-eligible zones endlessly. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; @@ -4589,7 +4630,7 @@ retry: if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4723,10 +4764,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; @@ -4762,7 +4803,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) * may get reset for allocations that ignore memory policies. 
*/ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* @@ -5310,7 +5351,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5323,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_SLAB_RECLAIMABLE), global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), @@ -5356,7 +5396,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5378,7 +5417,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5411,6 +5449,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " managed:%lukB" " mlocked:%lukB" " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5433,6 +5474,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + zone_page_state(zone, NR_KERNEL_SCS_KB), +#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), @@ -5546,21 +5590,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -5680,14 +5714,13 @@ static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; int node, load, nr_nodes = 0; - nodemask_t used_mask; + nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { @@ -5899,7 +5932,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat) static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static struct memblock_region *r; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { @@ -5915,27 +5947,9 @@ 
overlap_memmap_init(unsigned long zone, unsigned long *pfn) return true; } } -#endif return false; } -#ifdef CONFIG_SPARSEMEM -/* Skip PFNs that belong to non-present sections */ -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - const unsigned long section_nr = pfn_to_section_nr(++pfn); - - if (present_section_nr(section_nr)) - return pfn; - return section_nr_to_pfn(next_present_section_nr(section_nr)); -} -#else -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - return pfn++; -} -#endif - /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -5975,14 +5989,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * function. They do not exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) { - pfn = next_pfn(pfn); - continue; - } - if (!early_pfn_in_nid(pfn, nid)) { - pfn++; - continue; - } if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6098,9 +6104,23 @@ static void __meminit zone_init_free_lists(struct zone *zone) } void __meminit __weak memmap_init(unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) + unsigned long zone, + unsigned long range_start_pfn) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); + unsigned long start_pfn, end_pfn; + unsigned long range_end_pfn = range_start_pfn + size; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + + if (end_pfn > start_pfn) { + size = end_pfn - start_pfn; + memmap_init_zone(size, nid, zone, start_pfn, + MEMMAP_EARLY, NULL); + } + } } static int zone_batchsize(struct zone *zone) @@ -6252,10 +6272,25 @@ void __init setup_per_cpu_pageset(void) { struct pglist_data *pgdat; struct zone *zone; + int __maybe_unused cpu; for_each_populated_zone(zone) setup_zone_pageset(zone); +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); + memset(pcp->vm_numa_stat_diff, 0, + sizeof(pcp->vm_numa_stat_diff)); + } +#endif + for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); @@ -6298,57 +6333,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != NUMA_NO_NODE) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
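With the early_pfn_valid() and early_pfn_in_nid() filtering gone from memmap_init_zone(), the weak memmap_init() above takes over hole avoidance by walking the node's memblock ranges and clamping each one to the zone being set up. The intersection step, shown with made-up PFNs:

static void example_clamp_to_zone(unsigned long *start_pfn, unsigned long *end_pfn,
				  unsigned long range_start_pfn,
				  unsigned long range_end_pfn)
{
	/*
	 * memblock range [0x0800, 0x3000) against a zone spanning
	 * [0x1000, 0x8000) is clamped to [0x1000, 0x3000); a range that
	 * falls entirely outside the zone clamps to an empty interval and
	 * is skipped by the end_pfn > start_pfn test in memmap_init().
	 */
	*start_pfn = clamp(*start_pfn, range_start_pfn, range_end_pfn);
	*end_pfn = clamp(*end_pfn, range_start_pfn, range_end_pfn);
}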
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. @@ -6461,8 +6445,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *ignored) + unsigned long *zone_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6526,8 +6509,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *ignored) + unsigned long node_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6574,45 +6556,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __init zone_spanned_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *zones_size) -{ - unsigned int zone; - - *zone_start_pfn = node_start_pfn; - for (zone = 0; zone < zone_type; zone++) - *zone_start_pfn += zones_size[zone]; - - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; - - return zones_size[zone_type]; -} - -static inline unsigned long __init zone_absent_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zholes_size) -{ - if (!zholes_size) - return 0; - - return zholes_size[zone_type]; -} - -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long node_end_pfn) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6620,17 +6566,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + spanned = 
zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6930,10 +6880,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6950,30 +6898,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +static void __init free_area_init_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; + pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif - calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6981,6 +6924,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +void __init free_area_init_memoryless_node(int nid) +{ + free_area_init_node(nid); +} + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Initialize all valid struct pages in the range [spfn, epfn) and mark them @@ -7064,8 +7012,6 @@ static inline void __init init_unavailable_mem(void) } #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. 
@@ -7129,24 +7075,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/* Find the lowest pfn for a node */ -static unsigned long __init find_min_pfn_for_node(int nid) -{ - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) - min_pfn = min(min_pfn, start_pfn); - - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); - return 0; - } - - return min_pfn; -} - /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * @@ -7155,7 +7083,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) */ unsigned long __init find_min_pfn_with_active_regions(void) { - return find_min_pfn_for_node(MAX_NUMNODES); + return PHYS_PFN(memblock_start_of_DRAM()); } /* @@ -7208,7 +7136,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (!memblock_is_hotpluggable(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? @@ -7229,7 +7157,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (memblock_is_mirror(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = memblock_region_memory_base_pfn(r); @@ -7244,7 +7172,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) } if (mem_below_4gb_not_mirrored) - pr_warn("This configuration results in unmirrored kernel memory."); + pr_warn("This configuration results in unmirrored kernel memory.\n"); goto out2; } @@ -7409,8 +7337,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } } +/* + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * such cases we allow max_zone_pfn sorted in the descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. @@ -7422,10 +7359,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. 
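arch_has_descending_max_zone_pfns() above is a weak hook, so most architectures inherit the ascending layout and only the odd ones out override it. A hedged example of such an override; the actual upstream condition on ARC may be more nuanced than a bare true:

bool arch_has_descending_max_zone_pfns(void)
{
	/* ZONE_HIGHMEM sits below ZONE_NORMAL on this platform */
	return true;
}

free_area_init() then walks max_zone_pfn from the top zone index down, so the boundary bookkeeping still sees monotonically increasing PFNs.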
*/ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7434,14 +7372,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } @@ -7494,8 +7438,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + free_area_init_node(nid); /* Any memory on that node */ if (pgdat->node_present_pages) @@ -7560,8 +7503,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); @@ -7684,13 +7625,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *zones_size) -{ - init_unavailable_mem(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); -} - static int page_alloc_cpu_dead(unsigned int cpu) { @@ -7808,9 +7742,10 @@ static void setup_per_zone_lowmem_reserve(void) idx--; lower_zone = pgdat->node_zones + idx; - if (sysctl_lowmem_reserve_ratio[idx] < 1) { - sysctl_lowmem_reserve_ratio[idx] = 0; + if (!sysctl_lowmem_reserve_ratio[idx] || + !zone_managed_pages(lower_zone)) { lower_zone->lowmem_reserve[j] = 0; + continue; } else { lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; @@ -7875,9 +7810,9 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); + zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7963,7 +7898,7 @@ core_initcall(init_per_zone_wmark_min) * changes. 
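free_area_init() above absorbs what used to be free_area_init_nodes(): it takes only the per-zone maximum PFNs, derives node ranges from memblock, and calls free_area_init_node() for every online node. A hedged sketch of how an architecture's paging_init() feeds it; the zone indices and the MAX_DMA32_PFN / max_low_pfn symbols follow x86 conventions and are assumptions here:

void __init example_paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	max_zone_pfns[ZONE_DMA32]  = min(MAX_DMA32_PFN, max_low_pfn);
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

	free_area_init(max_zone_pfns);	/* formerly free_area_init_nodes() */
}

The old single-node free_area_init(zones_size) variant is removed above, so flat-memory configurations call the same entry point with the unused zones left at zero.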
*/ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7978,20 +7913,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int rc; - - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (rc) - return rc; - - return 0; -} - int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8021,7 +7944,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8048,7 +7971,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8072,9 +7995,17 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { + int i; + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + setup_per_zone_lowmem_reserve(); return 0; } @@ -8094,7 +8025,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; @@ -8238,7 +8169,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* @@ -8402,7 +8333,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. 
*/ - unsigned long nr_reclaimed; + unsigned int nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index 18ecde9f45b2..360461509423 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_owner = get_page_owner(page_ext); - page_mt = gfpflags_to_migratetype( - page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page) page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; - mt = gfpflags_to_migratetype(gfp_mask); + mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); diff --git a/mm/percpu.c b/mm/percpu.c index d7e3bc649f4e..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <asm/cacheflush.h> #include <asm/sections.h> @@ -481,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** @@ -1557,10 +1558,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { - /* whitelisted flags that can be passed to the backing allocators */ - gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - bool do_warn = !(gfp & __GFP_NOWARN); + gfp_t pcpu_gfp; + bool is_atomic; + bool do_warn; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; @@ -1569,6 +1569,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, void __percpu *ptr; size_t bits, bit_align; + gfp = current_gfp_context(gfp); + /* whitelisted flags that can be passed to the backing allocators */ + pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + do_warn = !(gfp & __GFP_NOWARN); + /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. 
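pcpu_alloc() above now runs its gfp mask through current_gfp_context() before deriving pcpu_gfp and is_atomic, so scoped allocation constraints set with memalloc_nofs_save()/memalloc_noio_save() are honoured by the percpu allocator as well. A hedged illustration of the caller-side pattern this change is meant to respect; example_fs_path() is hypothetical:

static void __percpu *example_fs_path(size_t size)
{
	unsigned int nofs = memalloc_nofs_save();
	void __percpu *p;

	/*
	 * Inside the scope, current_gfp_context(GFP_KERNEL) masks off
	 * __GFP_FS, so the allocation below cannot recurse into
	 * filesystem reclaim even though plain GFP_KERNEL is passed.
	 */
	p = __alloc_percpu_gfp(size, sizeof(u64), GFP_KERNEL);

	memalloc_nofs_restore(nofs);
	return p;
}

The readahead rework in mm/readahead.c below relies on the same memalloc_nofs_save() scoping for the page-cache allocations it performs on behalf of filesystems.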
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3d7c01e76efc..d18f0e1b6792 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } diff --git a/mm/ptdump.c b/mm/ptdump.c index 26208d0d03b7..f4ce916f5602 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + if (pgd_leaf(val)) st->note_page(st, addr, 0, pgd_val(val)); @@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + if (p4d_leaf(val)) st->note_page(st, addr, 1, p4d_val(val)); @@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + if (pud_leaf(val)) st->note_page(st, addr, 2, pud_val(val)); @@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); if (pmd_leaf(val)) st->note_page(st, addr, 3, pmd_val(val)); @@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; + pte_t val = READ_ONCE(*pte); + + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); - st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); + st->note_page(st, addr, 4, pte_val(val)); return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index 2fe72cd29b47..3c9a8dd7c56c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -22,6 +22,7 @@ #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> +#include <linux/sched/mm.h> #include "internal.h" @@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); -static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned int nr_pages, gfp_t gfp) +static void read_pages(struct readahead_control *rac, struct list_head *pages, + bool skip_page) { + const struct address_space_operations *aops = rac->mapping->a_ops; + struct page *page; struct blk_plug plug; - unsigned page_idx; - int ret; + + if (!readahead_count(rac)) + goto out; blk_start_plug(&plug); - if (mapping->a_ops->readpages) { - ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { + aops->readpages(rac->file, rac->mapping, pages, + readahead_count(rac)); /* Clean up the remaining pages */ put_pages_list(pages); - goto out; - } - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, 
mapping, page->index, gfp)) - mapping->a_ops->readpage(filp, page); - put_page(page); + rac->_index += rac->_nr_pages; + rac->_nr_pages = 0; + } else { + while ((page = readahead_page(rac))) { + aops->readpage(rac->file, page); + put_page(page); + } } - ret = 0; -out: blk_finish_plug(&plug); - return ret; + BUG_ON(!list_empty(pages)); + BUG_ON(readahead_count(rac)); + +out: + if (skip_page) + rac->_index++; } -/* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates - * the pages first, then submits them for I/O. This avoids the very bad - * behaviour which would occur if page allocations are causing VM writeback. - * We really don't want to intermingle reads and writes like that. +/** + * page_cache_readahead_unbounded - Start unchecked readahead. + * @mapping: File address space. + * @file: This instance of the open file; used for authentication. + * @index: First page index to read. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. * - * Returns the number of pages requested, or the maximum amount of I/O allowed. + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. + * May sleep, but will not reenter filesystem to reclaim memory. */ -unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void page_cache_readahead_unbounded(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; - struct page *page; - unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); - int page_idx; - unsigned int nr_pages = 0; - loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); + struct readahead_control rac = { + .mapping = mapping, + .file = file, + ._index = index, + }; + unsigned long i; - if (isize == 0) - goto out; - - end_index = ((isize - 1) >> PAGE_SHIFT); + /* + * Partway through the readahead operation, we will have added + * locked pages to the page cache, but will not yet have submitted + * them for I/O. Adding another page may need to allocate memory, + * which can trigger memory reclaim. Telling the VM we're in + * the middle of a filesystem operation will cause it to not + * touch file-backed pages, preventing a deadlock. Most (all?) + * filesystems already specify __GFP_NOFS in their mapping's + * gfp_mask, but let's be explicit here. + */ + unsigned int nofs = memalloc_nofs_save(); /* * Preallocate as many pages as we will need. */ - for (page_idx = 0; page_idx < nr_to_read; page_idx++) { - pgoff_t page_offset = offset + page_idx; + for (i = 0; i < nr_to_read; i++) { + struct page *page = xa_load(&mapping->i_pages, index + i); - if (page_offset > end_index) - break; + BUG_ON(index + i != rac._index + rac._nr_pages); - page = xa_load(&mapping->i_pages, page_offset); if (page && !xa_is_value(page)) { /* - * Page already present? Kick off the current batch of - * contiguous pages before continuing with the next - * batch. + * Page already present? Kick off the current batch + * of contiguous pages before continuing with the + * next batch. 
This page may be the one we would + * have intended to mark as Readahead, but we don't + * have a stable reference to this page, and it's + * not worth getting one just for that. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, - gfp_mask); - nr_pages = 0; + read_pages(&rac, &page_pool, true); continue; } page = __page_cache_alloc(gfp_mask); if (!page) break; - page->index = page_offset; - list_add(&page->lru, &page_pool); - if (page_idx == nr_to_read - lookahead_size) + if (mapping->a_ops->readpages) { + page->index = index + i; + list_add(&page->lru, &page_pool); + } else if (add_to_page_cache_lru(page, mapping, index + i, + gfp_mask) < 0) { + put_page(page); + read_pages(&rac, &page_pool, true); + continue; + } + if (i == nr_to_read - lookahead_size) SetPageReadahead(page); - nr_pages++; + rac._nr_pages++; } /* @@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); - BUG_ON(!list_empty(&page_pool)); -out: - return nr_pages; + read_pages(&rac, &page_pool, false); + memalloc_nofs_restore(nofs); +} +EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); + +/* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +void __do_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_readahead_unbounded(mapping, file, index, nr_to_read, + lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. 
*/ -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) +void force_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t index, unsigned long nr_to_read) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) - return -EINVAL; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) + return; /* * If the request exceeds the readahead window, allow the read to @@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); + __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); - offset += this_chunk; + index += this_chunk; nr_to_read -= this_chunk; } - return 0; } /* @@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* - * Count contiguously cached pages from @offset-1 to @offset-@max, + * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - pgoff_t offset, unsigned long max) + pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); - head = page_cache_prev_miss(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); - return offset - 1 - head; + return index - 1 - head; } /* @@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping, */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, - pgoff_t offset, + pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; - size = count_history_pages(mapping, offset, max); + size = count_history_pages(mapping, index, max); /* * not enough history pages: @@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping, * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ - if (size >= offset) + if (size >= index) size *= 2; - ra->start = offset; + ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; @@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm for trivial sequential/random reads. */ -static unsigned long -ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t offset, - unsigned long req_size) +static void ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t index, + unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - pgoff_t prev_offset; + pgoff_t prev_index; /* * If the request exceeds the readahead window, allow the read to @@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping, /* * start of file */ - if (!offset) + if (!index) goto initial_readahead; /* - * It's the expected callback offset, assume sequential access. 
+ * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if ((offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((index == (ra->start + ra->size - ra->async_size) || + index == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, index + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max_pages) - return 0; + if (!start || start - index > max_pages) + return; ra->start = start; - ra->size = start - offset; /* old async_size */ + ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss - * trivial case: (offset - prev_offset) == 1 - * unaligned reads: (offset - prev_offset) == 0 + * trivial case: (index - prev_index) == 1 + * unaligned reads: (index - prev_index) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; - if (offset - prev_offset <= 1UL) + prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; + if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) + if (try_context_readahead(mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ - return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + __do_page_cache_readahead(mapping, filp, index, req_size, 0); + return; initial_readahead: - ra->start = offset; + ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; @@ -478,7 +537,7 @@ readit: * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. */ - if (offset == ra->start && ra->size == ra->async_size) { + if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; @@ -489,7 +548,7 @@ readit: } } - return ra_submit(ra, mapping, filp); + ra_submit(ra, mapping, filp); } /** @@ -497,9 +556,8 @@ readit: * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. 
The readahead logic may decide to piggyback more @@ -508,7 +566,7 @@ readit: */ void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - pgoff_t offset, unsigned long req_size) + pgoff_t index, unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping, /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, offset, req_size); + force_page_cache_readahead(mapping, filp, index, req_count); return; } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, offset, req_size); + ondemand_readahead(mapping, ra, filp, false, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_readahead); @@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @page: the page at @offset which has the PG_readahead flag set - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which - * has the PG_readahead flag; this is a marker to suggest that the application + * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t offset, - unsigned long req_size) + struct page *page, pgoff_t index, + unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping, return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, offset, req_size); + ondemand_readahead(mapping, ra, filp, true, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index f79a206b271a..ad4a0fdcc94c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first; + if (unlikely(PageKsm(page))) + lock_page_memcg(page); + else + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (compound) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page, * disabled. 
*/ if (compound) - __inc_node_page_state(page, NR_ANON_THPS); - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) - return; - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (unlikely(PageKsm(page))) { + unlock_page_memcg(page); + return; + } /* address might be in next vma when migration races vma_adjust */ if (first) @@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_node_page_state(page, NR_ANON_THPS); + __inc_lruvec_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) int i, nr = 1; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); - goto out; + return; } /* page still mapped by someone else? */ @@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; if (PageSwapBacked(page)) __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; } /* @@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page); -out: - unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } /** @@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) - return page_remove_file_rmap(page, compound); + lock_page_memcg(page); - if (compound) - return page_remove_anon_compound_rmap(page); + if (!PageAnon(page)) { + page_remove_file_rmap(page, compound); + goto out; + } + + if (compound) { + page_remove_anon_compound_rmap(page); + goto out; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. 
*/ - __dec_node_page_state(page, NR_ANON_MAPPED); + __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ +out: + unlock_page_memcg(page); } /* diff --git a/mm/shmem.c b/mm/shmem.c index bd8840082c94..ea95a3e46fbb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -605,11 +605,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); unsigned long i = 0; unsigned long nr = compound_nr(page); + int error; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -621,6 +623,18 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; + if (!PageSwapCache(page)) { + error = mem_cgroup_charge(page, charge_mm, gfp); + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + } + cgroup_throttle_swaprate(page, gfp); + do { void *entry; xas_lock_irq(&xas); @@ -641,19 +655,22 @@ next: __inc_node_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - page->mapping = NULL; - page_ref_sub(page, nr); - return xas_error(&xas); + error = xas_error(&xas); + goto error; } return 0; +error: + page->mapping = NULL; + page_ref_sub(page, nr); + return error; } /* @@ -670,8 +687,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + __dec_lruvec_page_state(page, NR_FILE_PAGES); + __dec_lruvec_page_state(page, NR_SHMEM); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -1578,8 +1595,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(oldpage, newpage); + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1591,8 +1609,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage); - lru_cache_add_anon(newpage); + lru_cache_add(newpage); *pagep = newpage; } @@ -1619,7 +1636,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? 
vma->vm_mm : current->mm; - struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1664,31 +1680,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@ -1734,7 +1731,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; @@ -1859,25 +1855,12 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) { - if (PageTransHuge(page)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced += compound_nr(page); @@ -2314,7 +2297,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; spinlock_t *ptl; void *page_kaddr; struct page *page; @@ -2364,16 +2346,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK); + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false, false); + goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2394,13 +2370,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; + goto out_release_unlock; ret = -EEXIST; if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; + goto out_release_unlock; - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced++; @@ -2419,12 +2395,10 @@ static 
int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = 0; out: return ret; -out_release_uncharge_unlock: +out_release_unlock: pte_unmap_unlock(dst_pte, ptl); ClearPageDirty(page); delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); out_release: unlock_page(page); put_page(page); diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..9350062ffc1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; struct page *page; int nid; @@ -3124,7 +3124,7 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { nid = zone_to_nid(zone); if (cpuset_zone_allowed(zone, flags) && diff --git a/mm/slab_common.c b/mm/slab_common.c index 23c7500eea7d..9e72ba224175 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( kmalloc_info[i].name[KMALLOC_DMA], kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, 0); + SLAB_CACHE_DMA | flags, 0, + kmalloc_info[i].size); } } #endif diff --git a/mm/slob.c b/mm/slob.c index fa53e9f73893..ac2aecfbc7a8 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -524,6 +524,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) { return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); } +EXPORT_SYMBOL(__kmalloc_track_caller); #ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, @@ -531,6 +532,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, { return __do_kmalloc_node(size, gfp, node, caller); } +EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif void kfree(const void *block) diff --git a/mm/slub.c b/mm/slub.c index 9bf44955c4f1..d52487919278 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -551,15 +551,32 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. + */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; +} + static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { struct track *p; - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + p = object + get_info_end(s); return p + alloc; } @@ -662,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
va_end(args); } +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, page, nextfree)) { + object_err(s, page, freelist, "Freechain corrupt"); + freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } + + return false; +} + static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ @@ -686,10 +717,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); - if (s->offset) - off = s->offset + sizeof(void *); - else - off = s->inuse; + off = get_info_end(s); if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); @@ -782,7 +810,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object address * Bytes of the object to be managed. * If the freepointer may overlay the object then the free - * pointer is the first word of the object. + * pointer is at the middle of the object. * * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) @@ -816,11 +844,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { - unsigned long off = s->inuse; /* The end of info */ - - if (s->offset) - /* Freepointer is placed after the object. */ - off += sizeof(void *); + unsigned long off = get_info_end(s); /* The end of info */ if (s->flags & SLAB_STORE_USER) /* We also have user information there */ @@ -907,7 +931,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && val == SLUB_RED_ACTIVE) + if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -1400,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + return false; +} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1909,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1938,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); @@ -2083,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *prior; unsigned long counters; + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist' is already corrupted. So isolate all objects + * starting at 'freelist'. 
+ */ + if (freelist_corrupted(s, page, freelist, nextfree)) + break; + do { prior = page->freelist; counters = page->counters; @@ -3587,6 +3624,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * * This is the case if we do RCU, have a constructor or * destructor or are poisoning the objects. + * + * The assumption that s->offset >= s->inuse means free + * pointer is outside of the object is used in the + * freeptr_outside_object() function. If that is no + * longer true, the function needs to be modified. */ s->offset = size; size += sizeof(void *); @@ -3724,12 +3766,14 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) + const char *text, unsigned long *map) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map; + + if (!map) + return; slab_err(s, page, text, s->name); slab_lock(page); @@ -3742,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } - put_map(map); - slab_unlock(page); #endif } @@ -3757,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; + unsigned long *map = NULL; + +#ifdef CONFIG_SLUB_DEBUG + map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); +#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3766,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()"); + "Objects remaining in %s on __kmem_cache_shutdown()", + map); } } spin_unlock_irq(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + bitmap_free(map); +#endif + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -4393,6 +4445,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) return ret; } +EXPORT_SYMBOL(__kmalloc_track_caller); #ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, @@ -4423,6 +4476,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } +EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif #ifdef CONFIG_SYSFS @@ -5639,7 +5693,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && + !IS_ENABLED(CONFIG_SLUB_STATS)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5673,19 +5728,6 @@ static struct kobj_type slab_ktype = { .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &slab_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) @@ -5753,7 +5795,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); out: kobject_put(&s->kobj); } @@ -5794,8 +5835,10 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) + if (err) { + kobject_put(&s->kobj); goto out; + } err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ 
-5811,7 +5854,6 @@ static int sysfs_slab_add(struct kmem_cache *s) } #endif - kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -5892,7 +5934,7 @@ static int __init slab_sysfs_init(void) mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); diff --git a/mm/swap.c b/mm/swap.c index bf9a79fed62d..dbcab84c6fce 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -35,6 +35,7 @@ #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> +#include <linux/local_lock.h> #include "internal.h" @@ -44,14 +45,32 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); +/* Protecting only lru_rotate.pvec which requires disabling interrupts */ +struct lru_rotate { + local_lock_t lock; + struct pagevec pvec; +}; +static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +/* + * The following struct pagevec are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). + */ +struct lru_pvecs { + local_lock_t lock; + struct pagevec lru_add; + struct pagevec lru_deactivate_file; + struct pagevec lru_deactivate; + struct pagevec lru_lazyfree; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + struct pagevec activate_page; #endif +}; +static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* * This path almost never happens for VM activity - pages are normally @@ -83,8 +102,6 @@ static void __put_single_page(struct page *page) static void __put_compound_page(struct page *page) { - compound_page_dtor *dtor; - /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. 
This is because hugetlb page does never have PageLRU set @@ -93,8 +110,7 @@ static void __put_compound_page(struct page *page) */ if (!PageHuge(page)) __page_cache_release(page); - dtor = get_compound_page_dtor(page); - (*dtor)(page); + destroy_compound_page(page); } void __put_page(struct page *page) @@ -225,7 +241,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, page_lru(page)); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec, page_lru(page)); - (*pgmoved)++; + (*pgmoved) += hpage_nr_pages(page); } } @@ -254,30 +270,57 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; get_page(page); - local_irq_save(flags); - pvec = this_cpu_ptr(&lru_rotate_pvecs); + local_lock_irqsave(&lru_rotate.lock, flags); + pvec = this_cpu_ptr(&lru_rotate.pvec); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } } -static void update_page_reclaim_stat(struct lruvec *lruvec, - int file, int rotated) +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + do { + unsigned long lrusize; + + /* Record cost event */ + if (file) + lruvec->file_cost += nr_pages; + else + lruvec->anon_cost += nr_pages; + + /* + * Decay previous events + * + * Because workloads change over time (and to avoid + * overflow) we keep these statistics as a floating + * average, which ends up weighing recent refaults + * more than old ones. + */ + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + + lruvec_page_state(lruvec, NR_ACTIVE_FILE); + + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { + lruvec->file_cost /= 2; + lruvec->anon_cost /= 2; + } + } while ((lruvec = parent_lruvec(lruvec))); +} - reclaim_stat->recent_scanned[file]++; - if (rotated) - reclaim_stat->recent_rotated[file]++; +void lru_note_cost_page(struct page *page) +{ + lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), + page_is_file_lru(page), hpage_nr_pages(page)); } static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); @@ -285,15 +328,16 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, lru); trace_mm_lru_activate(page); - __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(lruvec, file, 1); + __count_vm_events(PGACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, + nr_pages); } } #ifdef CONFIG_SMP static void activate_page_drain(int cpu) { - struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, __activate_page, NULL); @@ -301,19 +345,21 @@ static void activate_page_drain(int cpu) static bool need_activate_page_drain(int cpu) { - return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && 
!PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -335,9 +381,12 @@ void activate_page(struct page *page) static void __lru_cache_activate_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec; int i; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + /* * Search backwards on the optimistic assumption that the page being * activated has just been added to this pagevec. Note that only @@ -357,7 +406,7 @@ static void __lru_cache_activate_page(struct page *page) } } - put_cpu_var(lru_add_pvec); + local_unlock(&lru_pvecs.lock); } /* @@ -385,7 +434,7 @@ void mark_page_accessed(struct page *page) } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via - * activate_page_pvecs. Otherwise, assume the page is on a + * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. */ @@ -402,35 +451,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -static void __lru_cache_add(struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); - - get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) - __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvec); -} - -/** - * lru_cache_add_anon - add a page to the page lists - * @page: the page to add - */ -void lru_cache_add_anon(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} - -void lru_cache_add_file(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} -EXPORT_SYMBOL(lru_cache_add_file); - /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. @@ -442,10 +462,19 @@ EXPORT_SYMBOL(lru_cache_add_file); */ void lru_cache_add(struct page *page) { + struct pagevec *pvec; + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); - __lru_cache_add(page); + + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + if (!pagevec_add(pvec, page) || PageCompound(page)) + __pagevec_lru_add(pvec); + local_unlock(&lru_pvecs.lock); } +EXPORT_SYMBOL(lru_cache_add); /** * lru_cache_add_active_or_unevictable @@ -501,8 +530,9 @@ void lru_cache_add_active_or_unevictable(struct page *page, static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { - int lru, file; + int lru; bool active; + int nr_pages = hpage_nr_pages(page); if (!PageLRU(page)) return; @@ -515,7 +545,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -536,28 +565,31 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, * We moves tha page into tail of inactive. 
*/ add_page_to_lru_list_tail(page, lruvec, lru); - __count_vm_event(PGROTATED); + __count_vm_events(PGROTATED, nr_pages); } - if (active) - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + if (active) { + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); + } } static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(page, lruvec, lru); - __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); } } @@ -567,6 +599,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { bool active = PageActive(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); @@ -580,9 +613,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); - count_memcg_page_event(page, PGLAZYFREE); - update_page_reclaim_stat(lruvec, 1, 0); + __count_vm_events(PGLAZYFREE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, + nr_pages); } } @@ -593,30 +626,30 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = &per_cpu(lru_rotate.pvec, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(&lru_rotate.lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } - pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - pvec = &per_cpu(lru_lazyfree_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -641,11 +674,14 @@ void deactivate_file_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - put_cpu_var(lru_deactivate_file_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -660,12 +696,14 @@ void deactivate_file_page(struct page *page) void deactivate_page(struct page *page) { if 
(PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -680,19 +718,30 @@ void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); - put_cpu_var(lru_lazyfree_pvecs); + local_unlock(&lru_pvecs.lock); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&lru_pvecs.lock); +} + +void lru_add_drain_cpu_zone(struct zone *zone) +{ + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + drain_local_pages(zone); + local_unlock(&lru_pvecs.lock); } #ifdef CONFIG_SMP @@ -743,11 +792,11 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || - pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || + if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -890,8 +939,6 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *list) { - const int file = 0; - VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); @@ -917,9 +964,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, add_page_to_lru_list_tail(page_tail, lruvec, page_lru(page_tail)); } - - if (!PageUnevictable(page)) - update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -928,6 +972,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, { enum lru_list lru; int was_unevictable = TestClearPageUnevictable(page); + int nr_pages = hpage_nr_pages(page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -962,16 +1007,14 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_lru(page), - PageActive(page)); if (was_unevictable) - count_vm_event(UNEVICTABLE_PGRESCUED); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { lru = LRU_UNEVICTABLE; ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) - count_vm_event(UNEVICTABLE_PGCULLED); + __count_vm_events(UNEVICTABLE_PGCULLED, 
nr_pages); } add_page_to_lru_list(page, lruvec, lru); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 45affaef3bc6..7f34343c075a 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); @@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return; - mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; diff --git a/mm/swap_state.c b/mm/swap_state.c index ebed37bbf7a3..9d20b00627af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,12 +360,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page = NULL, *new_page = NULL; struct swap_info_struct *si; - int err; + struct page *page; + *new_page_allocated = false; - do { + for (;;) { + int err; /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling @@ -373,12 +374,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ si = get_swap_device(entry); if (!si) - break; - found_page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + return NULL; + page = find_get_page(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (found_page) - break; + if (page) + return page; /* * Just skip read ahead for unused swap slot. @@ -389,54 +390,71 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * else swap_off will be aborted if we return NULL. */ if (!__swp_swapcount(entry) && swap_slot_cache_enabled) - break; + return NULL; /* - * Get a new page to read into from swap. + * Get a new page to read into from swap. Allocate it now, + * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will + * cause any racers to loop around until we add it to cache. */ - if (!new_page) { - new_page = alloc_page_vma(gfp_mask, vma, addr); - if (!new_page) - break; /* Out of memory */ - } + page = alloc_page_vma(gfp_mask, vma, addr); + if (!page) + return NULL; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { - /* - * We might race against get_swap_page() and stumble - * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet. - */ - cond_resched(); - continue; - } else if (err) /* swp entry is obsolete ? */ + if (!err) break; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); - if (likely(!err)) { - /* Initiate read into locked page */ - SetPageWorkingset(new_page); - lru_cache_add_anon(new_page); - *new_page_allocated = true; - return new_page; - } - __ClearPageLocked(new_page); + put_page(page); + if (err != -EEXIST) + return NULL; + /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. + * We might race against __delete_from_swap_cache(), and + * stumble across a swap_map entry whose SWAP_HAS_CACHE + * has not yet been cleared. 
Or race against another + * __read_swap_cache_async(), which has set SWAP_HAS_CACHE + * in swap_map, but not yet added its page to swap cache. */ - put_swap_page(new_page, entry); - } while (err != -ENOMEM); + cond_resched(); + } + + /* + * The swap entry is ours to swap in. Prepare the new page. + */ + + __SetPageLocked(page); + __SetPageSwapBacked(page); + + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + put_swap_page(page, entry); + goto fail_unlock; + } + + if (mem_cgroup_charge(page, NULL, gfp_mask)) { + delete_from_swap_cache(page); + goto fail_unlock; + } + + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); + + /* Caller will initiate read into locked page */ + SetPageWorkingset(page); + lru_cache_add(page); + *new_page_allocated = true; + return page; - if (new_page) - put_page(new_page); - return found_page; +fail_unlock: + unlock_page(page); + put_page(page); + return NULL; } /* @@ -509,10 +527,11 @@ static unsigned long swapin_nr_pages(unsigned long offset) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); - pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, + max_pages, atomic_read(&last_readahead_pages)); if (!hits) - prev_offset = offset; + WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; diff --git a/mm/swapfile.c b/mm/swapfile.c index 5871a2aa86a5..a3d191e205f2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -601,7 +601,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, { struct percpu_cluster *cluster; struct swap_cluster_info *ci; - bool found_free; unsigned long tmp, max; new_cluster: @@ -614,17 +613,17 @@ new_cluster: } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); - *scan_base = *offset = si->cluster_next; + *scan_base = this_cpu_read(*si->cluster_next_cpu); + *offset = *scan_base; goto new_cluster; } else return false; } - found_free = false; - /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster @@ -632,27 +631,23 @@ new_cluster: tmp = cluster->next; max = min_t(unsigned long, si->max, (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); - if (tmp >= max) { - cluster_set_null(&cluster->index); - goto new_cluster; - } - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (!si->swap_map[tmp]) { - found_free = true; - break; + if (tmp < max) { + ci = lock_cluster(si, tmp); + while (tmp < max) { + if (!si->swap_map[tmp]) + break; + tmp++; } - tmp++; + unlock_cluster(ci); } - unlock_cluster(ci); - if (!found_free) { + if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; } cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; - return found_free; + return true; } static void __del_from_avail_list(struct swap_info_struct *p) @@ -729,6 +724,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, } } +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +{ + unsigned long prev; + + if (!(si->flags & 
SWP_SOLIDSTATE)) { + si->cluster_next = next; + return; + } + + prev = this_cpu_read(*si->cluster_next_cpu); + /* + * Cross the swap address space size aligned trunk, choose + * another trunk randomly to avoid lock contention on swap + * address space if possible. + */ + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { + /* No free swap slots available */ + if (si->highest_bit <= si->lowest_bit) + return; + next = si->lowest_bit + + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); + next = max_t(unsigned int, next, si->lowest_bit); + } + this_cpu_write(*si->cluster_next_cpu, next); +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -739,9 +762,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int n_ret = 0; - - if (nr > SWAP_BATCH) - nr = SWAP_BATCH; + bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -755,17 +776,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si, */ si->flags += SWP_SCANNING; - scan_base = offset = si->cluster_next; + /* + * Use percpu scan base for SSD to reduce lock contention on + * cluster and swap cache. For HDD, sequential access is more + * important. + */ + if (si->flags & SWP_SOLIDSTATE) + scan_base = this_cpu_read(*si->cluster_next_cpu); + else + scan_base = si->cluster_next; + offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) - goto checks; - else + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; - } - - if (unlikely(!si->cluster_nr--)) { + } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -848,7 +874,6 @@ checks: unlock_cluster(ci); swap_range_alloc(si, offset, 1); - si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -871,19 +896,33 @@ checks: if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto checks; - else - goto done; - } - /* non-ssd case */ - ++offset; - - /* non-ssd case, still more slots in cluster? */ - if (si->cluster_nr && !si->swap_map[offset]) { + } else if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } + /* + * Even if there's no free clusters available (fragmented), + * try to scan a little more quickly with lock held unless we + * have scanned too many slots already. 
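The new set_cluster_next() above keeps each CPU allocating inside its own swap-address-space sized "trunk" and hops to a random trunk when an allocation would cross a boundary, so different CPUs contend on different swap cache address spaces. The sketch below redoes that decision in plain userspace C; the 2^14-page trunk size is only an assumption standing in for SWAP_ADDRESS_SPACE_PAGES, and all names are illustrative rather than kernel code.

/* Userspace sketch of the trunk-hopping decision; trunk size assumed. */
#include <stdio.h>
#include <stdlib.h>

#define TRUNK_SHIFT 14                       /* assumed SWAP_ADDRESS_SPACE_SHIFT */
#define TRUNK_PAGES (1UL << TRUNK_SHIFT)
#define ALIGN_DOWN(x, a) ((x) & ~((unsigned long)(a) - 1))

static unsigned long lowest_bit = 1, highest_bit = 200000;

/* Returns the hint that would be stored in this CPU's cluster_next slot. */
static unsigned long next_hint(unsigned long prev, unsigned long next)
{
    if ((prev >> TRUNK_SHIFT) == (next >> TRUNK_SHIFT))
        return next;                         /* still inside the same trunk */

    if (highest_bit <= lowest_bit)
        return prev;                         /* no free slots: keep the old hint */

    /* Crossing a trunk boundary: restart at a random trunk-aligned offset
     * so CPUs spread their contention over the swap address spaces. */
    next = lowest_bit + rand() % (highest_bit - lowest_bit + 1);
    next = ALIGN_DOWN(next, TRUNK_PAGES);
    if (next < lowest_bit)
        next = lowest_bit;
    return next;
}

int main(void)
{
    printf("same trunk:  %lu\n", next_hint(100, 101));
    printf("trunk cross: %lu\n", next_hint(100, TRUNK_PAGES + 5));
    return 0;
}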
+ */ + if (!scanned_many) { + unsigned long scan_limit; + + if (offset < scan_base) + scan_limit = scan_base; + else + scan_limit = si->highest_bit; + for (; offset <= scan_limit && --latency_ration > 0; + offset++) { + if (!si->swap_map[offset]) + goto checks; + } + } + done: + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; @@ -901,6 +940,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } } offset = si->lowest_bit; @@ -916,6 +956,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } offset++; } @@ -1004,11 +1045,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) if (avail_pgs <= 0) goto noswap; - if (n_goal > SWAP_BATCH) - n_goal = SWAP_BATCH; - - if (n_goal > avail_pgs) - n_goal = avail_pgs; + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); @@ -1275,13 +1312,14 @@ unlock_out: } static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, usage); + usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); @@ -1316,7 +1354,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) - __swap_entry_free(p, entry, 1); + __swap_entry_free(p, entry); } /* @@ -1739,7 +1777,7 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { - count = __swap_entry_free(p, entry, 1); + count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), @@ -1854,7 +1892,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { struct page *swapcache; - struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1864,15 +1901,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = -ENOMEM; - goto out_nolock; - } - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1884,10 +1914,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1898,7 +1926,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); -out_nolock: if (page != swapcache) { unlock_page(page); put_page(page); @@ -1937,10 +1964,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); swap_map = &si->swap_map[offset]; - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, 
&vmf); + page = lookup_swap_cache(entry, vma, addr); + if (!page) { + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + &vmf); + } if (!page) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; @@ -2650,6 +2681,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); @@ -2757,20 +2790,24 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; + unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } + bytes = si->pages << (PAGE_SHIFT - 10); + inuse = si->inuse_pages << (PAGE_SHIFT - 10); + file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", - si->pages << (PAGE_SHIFT - 10), - si->inuse_pages << (PAGE_SHIFT - 10), + bytes, bytes < 10000000 ? "\t" : "", + inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } @@ -3202,11 +3239,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } /* * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), @@ -3322,6 +3367,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -3654,7 +3701,7 @@ static bool swap_count_continued(struct swap_info_struct *si, spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; - page = list_entry(head->lru.next, struct page, lru); + page = list_next_entry(head, lru); map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ @@ -3666,13 +3713,13 @@ static bool swap_count_continued(struct swap_info_struct *si, */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; @@ -3682,12 +3729,10 @@ init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; 
kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = true; /* incremented */ @@ -3698,7 +3743,7 @@ init_map: *map = 0; /* we didn't zero the page */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } @@ -3707,13 +3752,11 @@ init_map: *map = 0; /* we didn't zero the page */ if (*map == 0) count = 0; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = count == COUNT_CONTINUED; } @@ -3745,11 +3788,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask) +void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; - if (!(gfp_mask & __GFP_IO) || !memcg) + int nid = page_to_nid(page); + + if (!(gfp_mask & __GFP_IO)) return; if (!blk_cgroup_congested()) @@ -3763,11 +3807,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], - avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], + avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), - true); + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); break; } } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 512576e171ce..7f5194046b01 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep, bool wp_copy) { - struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; void *page_kaddr; @@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); @@ -124,7 +123,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -138,7 +136,6 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg, false); out_release: put_page(page); goto out; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..41b47d8cae09 100644 --- a/mm/util.c +++ b/mm/util.c @@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long 
sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a8227afa073..1e94497b7388 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -69,7 +69,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -78,73 +79,118 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } -static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @start: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. 
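The pgtbl_mod_mask plumbing introduced in these vmalloc hunks lets each walker record which page-table levels it actually changed, so arch_sync_kernel_mappings() runs only when a level the architecture cares about was touched. Here is a small self-contained model of that pattern; the bit names mimic PGTBL_*_MODIFIED, but the sync mask and the walk are simplified illustrations, not the kernel's.

/* Toy model of pgtbl_mod_mask accumulation; not kernel code. */
#include <stdio.h>

enum {
    MOD_PTE = 1 << 0,
    MOD_PMD = 1 << 1,
    MOD_PUD = 1 << 2,
    MOD_P4D = 1 << 3,
    MOD_PGD = 1 << 4,
};

/* An architecture might, for example, only need to hear about PMD and up. */
#define ARCH_SYNC_MASK (MOD_PMD | MOD_PUD | MOD_P4D | MOD_PGD)

static void clear_pte_level(unsigned int *mask)
{
    /* ...clear the PTEs of one PMD... */
    *mask |= MOD_PTE;
}

static void clear_pmd_level(unsigned int *mask, int huge)
{
    if (huge) {
        *mask |= MOD_PMD;        /* a huge mapping was torn down at PMD level */
        return;
    }
    clear_pte_level(mask);       /* only leaf PTEs change, the PMD entry stays */
}

static void unmap_range(int huge)
{
    unsigned int mask = 0;

    clear_pmd_level(&mask, huge);

    if (mask & ARCH_SYNC_MASK)
        printf("mask=%#x -> arch_sync_kernel_mappings() would run\n", mask);
    else
        printf("mask=%#x -> no sync needed\n", mask);
}

int main(void)
{
    unmap_range(0);    /* PTE-only change */
    unmap_range(1);    /* PMD-level change */
    return 0;
}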
The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - pgd_t *pgd; + unsigned long end = start + size; unsigned long next; + pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); + start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -153,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. */ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -166,94 +212,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/* - * Set up page tables in kva (addr, end). 
The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array. +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long start = addr; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); - return nr; + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return 0; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; } @@ -1223,14 +1292,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* - * Clear the pagetable entries of a given vmap_area - */ -static void unmap_vmap_area(struct vmap_area *va) -{ - vunmap_page_range(va->va_start, va->va_end); -} - -/* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * @@ -1293,12 +1354,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) return false; /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_unmappings(); - - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. 
*/ @@ -1391,7 +1446,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_vmap_area(va); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1665,7 +1720,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) return vaddr; } -static void vb_free(const void *addr, unsigned long size) +static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned long vb_idx; @@ -1675,24 +1730,22 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + flush_cache_vunmap(addr, addr + size); order = get_order(size); - offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); - offset >>= PAGE_SHIFT; + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; - vb_idx = addr_to_vb_idx((unsigned long)addr); + vb_idx = addr_to_vb_idx(addr); rcu_read_lock(); vb = radix_tree_lookup(&vmap_block_tree, vb_idx); rcu_read_unlock(); BUG_ON(!vb); - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + unmap_kernel_range_noflush(addr, size); if (debug_pagealloc_enabled_static()) - flush_tlb_kernel_range((unsigned long)addr, - (unsigned long)addr + size); + flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); @@ -1792,7 +1845,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); - vb_free(mem, size); + vb_free(addr, size); return; } @@ -1819,7 +1872,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1843,7 +1896,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -1988,51 +2041,6 @@ void __init vmalloc_init(void) } /** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. 
The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. - */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); - -/** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap * @size: size of the VM area to unmap @@ -2045,22 +2053,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size; flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); } -EXPORT_SYMBOL_GPL(unmap_kernel_range); - -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); - int err; - - err = vmap_page_range(addr, end, prot, pages); - - return err > 0 ? 0 : err; -} -EXPORT_SYMBOL_GPL(map_vm_area); static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) @@ -2128,14 +2123,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) @@ -2441,7 +2428,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2450,9 +2438,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2470,7 +2455,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - PAGE_KERNEL, node, area->caller); + node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2504,8 +2489,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: @@ -2573,27 +2560,16 @@ fail: return NULL; } -/* - * This is only for performance analysis of vmalloc and stress purpose. - * It is required by vmalloc test module, therefore do not use it other - * than that. 
- */ -#ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node_range); -#endif - /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level allocator with + * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported @@ -2603,35 +2579,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, 0, node, caller); + gfp_mask, PAGE_KERNEL, 0, node, caller); } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node); +#endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - - -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -2646,8 +2615,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2666,8 +2635,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2704,8 +2673,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -2718,39 +2687,16 @@ EXPORT_SYMBOL(vmalloc_node); * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * - * For tight control over page level allocator and protection flags - * use __vmalloc_node() instead. 
- * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - -/** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size * @@ -2793,8 +2739,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -3137,21 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose - * not to have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; diff --git a/mm/vmscan.c b/mm/vmscan.c index b06868fc4926..3792dd19788c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,12 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + /* Can active pages be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -161,7 +167,7 @@ struct scan_control { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 200. Higher means more swappy. 
*/ int vm_swappiness = 60; /* @@ -1066,17 +1072,17 @@ static void page_check_dirty_writeback(struct page *page, /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned long shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - enum ttu_flags ttu_flags, - struct reclaim_stat *stat, - bool ignore_references) +static unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + enum ttu_flags ttu_flags, + struct reclaim_stat *stat, + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - unsigned nr_reclaimed = 0; - unsigned pgactivate = 0; + unsigned int nr_reclaimed = 0; + unsigned int pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1295,11 +1301,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_mapped(page)) { enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && PageSwapBacked(page)) + stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } @@ -1349,6 +1359,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: + stat->nr_pageout += hpage_nr_pages(page); + if (PageWriteback(page)) goto keep; if (PageDirty(page)) @@ -1438,7 +1450,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); else list_add(&page->lru, &free_pages); continue; @@ -1483,7 +1495,7 @@ keep: return nr_reclaimed; } -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list) { struct scan_control sc = { @@ -1491,8 +1503,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - struct reclaim_stat dummy_stat; - unsigned long ret; + struct reclaim_stat stat; + unsigned int nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1504,11 +1516,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &dummy_stat, true); + nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); - return ret; + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to + * discard so isolated count will be mismatched. + * Compensate the isolated count for both LRU lists. 
+ */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -stat.nr_lazyfree_fail); + return nr_reclaimed; } /* @@ -1602,10 +1624,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, if (!nr_zone_taken[zid]) continue; - __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#endif + update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } @@ -1625,7 +1644,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session - * @mode: One of the LRU isolation modes * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. @@ -1860,7 +1878,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1879,13 +1897,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, /* * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ static int current_may_throttle(void) { - return !(current->flags & PF_LESS_THROTTLE) || + return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || bdi_write_congested(current->backing_dev_info); } @@ -1900,13 +1918,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, { LIST_HEAD(page_list); unsigned long nr_scanned; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { @@ -1930,12 +1947,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1946,16 +1963,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); + move_pages_to_lru(lruvec, &page_list); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + lru_note_cost(lruvec, file, stat.nr_pageout); item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; - reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; - - move_pages_to_lru(lruvec, &page_list); - - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&pgdat->lru_lock); @@ -2002,7 +2018,6 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); @@ -2016,7 +2031,6 @@ static void shrink_active_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); @@ -2043,7 +2057,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) { - nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -2054,6 +2067,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { + nr_rotated += hpage_nr_pages(page); list_add(&page->lru, &l_active); continue; } @@ -2068,13 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * Move pages back to the lru list. */ spin_lock_irq(&pgdat->lru_lock); - /* - * Count referenced pages from currently used mappings as rotated, - * even though only some of them are actually re-activated. This - * helps balance scan pressure between file and anonymous pages in - * get_scan_count. - */ - reclaim_stat->recent_rotated[file] += nr_rotated; nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2096,7 +2103,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long reclaim_pages(struct list_head *page_list) { int nid = NUMA_NO_NODE; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; @@ -2230,14 +2237,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); + unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file; unsigned long ap, fp; enum lru_list lru; @@ -2287,57 +2291,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - - /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = swappiness; - file_prio = 200 - anon_prio; - /* - * OK, so we have swap space and a fair amount of page cache - * pages. 
We use the recently rotated / recently scanned - * ratios to determine how valuable each cache is. + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). * - * Because workloads change over time (and to avoid overflow) - * we keep these statistics as a floating average, which ends - * up weighing recent references more than old ones. + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. * - * anon in [0], file in [1] + * With swappiness at 100, anon and file have equal IO cost. */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; - spin_lock_irq(&pgdat->lru_lock); - if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - reclaim_stat->recent_scanned[0] /= 2; - reclaim_stat->recent_rotated[0] /= 2; - } - - if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - reclaim_stat->recent_scanned[1] /= 2; - reclaim_stat->recent_rotated[1] /= 2; - } - - /* - * The amount of pressure on anon vs file pages is inversely - * proportional to the fraction of recently scanned pages on - * each list that were recently referenced and in active use. - */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); - ap /= reclaim_stat->recent_rotated[0] + 1; - - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); - fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&pgdat->lru_lock); + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; - denominator = ap + fp + 1; + denominator = ap + fp; out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); @@ -2567,7 +2549,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator - * calls try_to_compact_zone() that it will have enough free pages to succeed. + * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, @@ -2698,6 +2680,14 @@ again: nr_scanned = sc->nr_scanned; /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + + /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. 
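The new get_scan_count() math above drops the recent_scanned/recent_rotated sampling and instead splits pressure in inverse proportion to each list's accumulated reclaim cost, weighted by swappiness on its new 0..200 scale (100 means equal IO cost, 200 puts all pressure on anon, 0 puts it all on file). The standalone program below re-runs those formulas on a few made-up cost values; only the scenarios are invented.

/* Re-computation of the scan fractions using the formulas from this hunk;
 * the cost numbers fed in below are invented for illustration. */
#include <stdio.h>

static void scan_split(unsigned long anon_cost, unsigned long file_cost,
                       unsigned long swappiness)
{
    unsigned long total = anon_cost + file_cost;
    unsigned long acost = total + anon_cost;    /* adding total keeps roughly */
    unsigned long fcost = total + file_cost;    /* 1/3 of pressure on each list */
    unsigned long sum   = acost + fcost;
    unsigned long ap, fp;

    ap = swappiness * (sum + 1) / (acost + 1);
    fp = (200 - swappiness) * (sum + 1) / (fcost + 1);

    printf("anon_cost=%4lu file_cost=%4lu swappiness=%3lu -> anon %lu%%, file %lu%%\n",
           anon_cost, file_cost, swappiness,
           100 * ap / (ap + fp), 100 * fp / (ap + fp));
}

int main(void)
{
    scan_split(500, 500, 60);     /* equal cost, default swappiness           */
    scan_split(10, 1000, 60);     /* file reclaim has been costly: shift anon */
    scan_split(1000, 10, 60);     /* anon is the expensive list: mostly file  */
    scan_split(500, 500, 200);    /* swappiness 200: anon takes everything    */
    return 0;
}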
*/ @@ -3132,8 +3122,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) - WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3386,7 +3376,7 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; @@ -3398,7 +3388,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) * start prematurely when there is no boosting and a lower * zone is balanced. */ - for (i = classzone_idx; i >= 0; i--) { + for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3412,9 +3402,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) /* * Returns true if there is an eligible zone balanced for the request order - * and classzone_idx + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; @@ -3424,19 +3414,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* - * If a node has no populated zone within classzone_idx, it does not + * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -3462,7 +3452,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3484,7 +3475,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - if (pgdat_balanced(pgdat, order, classzone_idx)) { + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } @@ -3548,7 +3539,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -3576,7 +3567,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * stall or direct reclaim until kswapd is finished. 
*/ nr_boost_reclaim = 0; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3594,7 +3585,7 @@ restart: bool balanced; bool ret; - sc.reclaim_idx = classzone_idx; + sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed @@ -3624,7 +3615,7 @@ restart: * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ - balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; @@ -3724,7 +3715,7 @@ out: if (boosted) { unsigned long flags; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; @@ -3739,7 +3730,7 @@ out: * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ - wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); @@ -3757,22 +3748,22 @@ out: } /* - * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be - * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not - * a valid index then either kswapd runs for first time or kswapd couldn't sleep - * after previous reclaim attempt (node is still unbalanced). In that case - * return the zone index of the previous kswapd reclaim cycle. + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. */ -static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type prev_classzone_idx) +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) { - enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, - unsigned int classzone_idx) + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3789,7 +3780,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3802,18 +3793,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. 
*/ - wakeup_kcompactd(pgdat, alloc_order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* - * If woken prematurely then reset kswapd_classzone_idx and + * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { - WRITE_ONCE(pgdat->kswapd_classzone_idx, - kswapd_classzone_idx(pgdat, classzone_idx)); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); @@ -3828,7 +3820,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * go fully to sleep until explicitly woken up. */ if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3870,7 +3862,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; - unsigned int classzone_idx = MAX_NR_ZONES - 1; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -3894,22 +3886,24 @@ static int kswapd(void *p) set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - classzone_idx); + highest_zoneidx); - /* Read the new order and classzone_idx */ + /* Read the new order and highest_zoneidx */ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3930,9 +3924,10 @@ kswapd_try_sleep: * but kcompactd is woken to compact for the original * request (alloc_order). */ - trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3950,7 +3945,7 @@ kswapd_try_sleep: * needed. 
*/ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, - enum zone_type classzone_idx) + enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; @@ -3962,10 +3957,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; pgdat = zone->zone_pgdat; - curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) - WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); @@ -3975,8 +3970,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, classzone_idx) && - !pgdat_watermark_boosted(pgdat, classzone_idx))) { + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -3985,11 +3980,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..a7db29f7e5f7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1119,6 +1119,9 @@ const char * const vmstat_text[] = { "nr_mlock", "nr_page_table_pages", "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1162,7 +1165,6 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1201,6 +1203,10 @@ const char * const vmstat_text[] = { "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", #ifdef CONFIG_NUMA "zone_reclaim_failed", @@ -1590,6 +1596,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->present_pages, zone_managed_pages(zone)); + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); @@ -1597,12 +1609,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 
seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_putc(m, ')'); - /* If unpopulated, no other information is useful */ - if (!populated_zone(zone)) { - seq_putc(m, '\n'); - return; - } - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", zone_stat_name(i), zone_page_state(zone, i)); @@ -1723,6 +1729,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } @@ -1751,7 +1765,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/mm/workingset.c b/mm/workingset.c index 474186b76ced..d481ea452eeb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -277,8 +277,8 @@ void workingset_refault(struct page *page, void *shadow) struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; + unsigned long workingset_size; struct pglist_data *pgdat; - unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; @@ -310,7 +310,6 @@ void workingset_refault(struct page *page, void *shadow) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->inactive_age); - active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); /* * Calculate the refault distance @@ -345,10 +344,18 @@ void workingset_refault(struct page *page, void *shadow) /* * Compare the distance to the existing workingset size. We - * don't act on pages that couldn't stay resident even if all - * the memory was available to the page cache. + * don't activate pages that couldn't stay resident even if + * all the memory was available to the page cache. Whether + * cache can compete with anon or not depends on having swap. */ - if (refault_distance > active_file) + workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_INACTIVE_ANON); + workingset_size += lruvec_page_state(eviction_lruvec, + NR_ACTIVE_ANON); + } + if (refault_distance > workingset_size) goto out; SetPageActive(page); @@ -358,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow) /* Page was active prior to eviction */ if (workingset) { SetPageWorkingset(page); + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } out: diff --git a/mm/z3fold.c b/mm/z3fold.c index 42f31c4b53ad..460b0feced26 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -43,6 +43,7 @@ #include <linux/spinlock.h> #include <linux/zpool.h> #include <linux/magic.h> +#include <linux/kmemleak.h> /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively @@ -215,6 +216,8 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); if (slots) { + /* It will be freed separately in free_handle(). 
*/ + kmemleak_not_leak(slots); memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; rwlock_init(&slots->lock); @@ -318,16 +321,16 @@ static inline void free_handle(unsigned long handle) slots = handle_to_slots(handle); write_lock(&slots->lock); *(unsigned long *)handle = 0; - write_unlock(&slots->lock); - if (zhdr->slots == slots) + if (zhdr->slots == slots) { + write_unlock(&slots->lock); return; /* simple case, nothing else to do */ + } /* we are freeing a foreign handle if we are here */ zhdr->foreign_handles--; is_free = true; - read_lock(&slots->lock); if (!test_bit(HANDLES_ORPHANED, &slots->pool)) { - read_unlock(&slots->lock); + write_unlock(&slots->lock); return; } for (i = 0; i <= BUDDY_MASK; i++) { @@ -336,7 +339,7 @@ static inline void free_handle(unsigned long handle) break; } } - read_unlock(&slots->lock); + write_unlock(&slots->lock); if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); @@ -422,6 +425,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, zhdr->start_middle = 0; zhdr->cpu = -1; zhdr->foreign_handles = 0; + zhdr->mapped_count = 0; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2f836a2b993f..f6dc0673e62c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,7 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING struct vm_struct *vm; /* vm area for mapping object that span pages */ #else char *vm_buf; /* copy buffer for objects that span pages */ @@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area, unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING */ +#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -1233,7 +1235,7 @@ out: pagefault_enable(); } -#endif /* CONFIG_PGTABLE_MAPPING */ +#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static int zs_cpu_prepare(unsigned int cpu) {
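
The mm/vmscan.c hunks above rename kswapd_classzone_idx to kswapd_highest_zoneidx but keep the same waker/kswapd handshake: MAX_NR_ZONES acts as a sentinel meaning "no pending wakeup request", in which case kswapd falls back to the index from its previous cycle, and the field is reset to the sentinel before each sleep. A minimal userspace sketch of that sentinel logic follows; the struct, the constant, and the plain loads are illustrative stand-ins, not kernel symbols (the kernel uses READ_ONCE()/WRITE_ONCE() on pgdat fields).

/*
 * Userspace sketch of the kswapd_highest_zoneidx sentinel handshake.
 * All names and values are demo stand-ins.
 */
#include <stdio.h>

#define MAX_NR_ZONES_DEMO 5      /* sentinel: "no pending wakeup request" */

struct demo_pgdat {
	int kswapd_highest_zoneidx;  /* written by wakers, consumed by kswapd */
};

/* Mirrors kswapd_highest_zoneidx(): fall back to the previous cycle's index. */
static int demo_highest_zoneidx(const struct demo_pgdat *pgdat, int prev_idx)
{
	int curr = pgdat->kswapd_highest_zoneidx;

	return curr == MAX_NR_ZONES_DEMO ? prev_idx : curr;
}

int main(void)
{
	struct demo_pgdat pgdat = { .kswapd_highest_zoneidx = MAX_NR_ZONES_DEMO };
	int highest = MAX_NR_ZONES_DEMO - 1;   /* kswapd's initial assumption */

	/* No waker yet: sentinel present, keep the previous index. */
	highest = demo_highest_zoneidx(&pgdat, highest);
	printf("idle cycle      -> reclaim up to zone %d\n", highest);

	/* A waker asks for reclaim up to zone index 2. */
	pgdat.kswapd_highest_zoneidx = 2;
	highest = demo_highest_zoneidx(&pgdat, highest);
	printf("after wakeup    -> reclaim up to zone %d\n", highest);

	/* kswapd resets the field to the sentinel before sleeping again. */
	pgdat.kswapd_highest_zoneidx = MAX_NR_ZONES_DEMO;
	highest = demo_highest_zoneidx(&pgdat, highest);
	printf("next idle cycle -> reclaim up to zone %d\n", highest);
	return 0;
}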
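
The mm/vmstat.c hunk above removes nr_unstable from the live counters but still prints a hard-coded "nr_unstable 0" line after the last real item, so /proc/vmstat parsers that expect the name keep working. A small userspace sketch of that output pattern; the counter names and values here are made up.

/*
 * Userspace sketch: emit live counters, then append deprecated names pinned
 * to 0 so existing parsers that look for them do not break.
 */
#include <stdio.h>

static const char * const demo_vmstat_text[] = {
	"nr_free_pages",
	"nr_dirtied",
	"nr_written",
};
#define DEMO_NR_ITEMS (sizeof(demo_vmstat_text) / sizeof(demo_vmstat_text[0]))

int main(void)
{
	unsigned long demo_values[DEMO_NR_ITEMS] = { 123456, 789, 42 };

	for (size_t off = 0; off < DEMO_NR_ITEMS; off++) {
		printf("%s %lu\n", demo_vmstat_text[off], demo_values[off]);

		/* After the last real item: tack on the deprecated counter. */
		if (off == DEMO_NR_ITEMS - 1)
			printf("nr_unstable 0\n");
	}
	return 0;
}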
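
The mm/workingset.c change widens the activation test in workingset_refault(): the refault distance is now compared against the active file pages plus, only when swap is available, the anon LRUs as well, because file cache can only compete with anon pages if anon is actually reclaimable. A standalone sketch of that comparison with stand-in types and numbers (the kernel reads these counts from the lruvec and memcg, not from a plain struct):

/*
 * Userspace sketch of the refault-activation test: activate only if the
 * refault distance fits inside the (possibly anon-inclusive) workingset.
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_lruvec {
	unsigned long nr_active_file;
	unsigned long nr_inactive_anon;
	unsigned long nr_active_anon;
};

static bool demo_should_activate(const struct demo_lruvec *lruvec,
				 unsigned long refault_distance,
				 unsigned long nr_swap_pages)
{
	unsigned long workingset_size = lruvec->nr_active_file;

	/* Anon pages only count when swap makes them reclaimable. */
	if (nr_swap_pages > 0) {
		workingset_size += lruvec->nr_inactive_anon;
		workingset_size += lruvec->nr_active_anon;
	}
	return refault_distance <= workingset_size;
}

int main(void)
{
	struct demo_lruvec lruvec = {
		.nr_active_file = 1000,
		.nr_inactive_anon = 4000,
		.nr_active_anon = 2000,
	};

	/* Too distant when only file pages count... */
	printf("no swap:   %d\n", demo_should_activate(&lruvec, 2500, 0));
	/* ...but close enough once anon pages are fair game for reclaim. */
	printf("with swap: %d\n", demo_should_activate(&lruvec, 2500, 1 << 18));
	return 0;
}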
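
The mm/z3fold.c rework of free_handle() keeps the slots write lock held across both the "own handle" check and the HANDLES_ORPHANED test, instead of dropping it and re-taking a read lock, so the state checked is the state that existed when the handle was cleared. A simplified userspace sketch of the resulting control flow, built on pthread rwlocks with stand-in types; the real function also walks the z3fold header and adjusts foreign_handles, which is reduced here to a boolean parameter.

/*
 * Userspace sketch of free_handle()'s locking shape after the change:
 * clear the handle and make all follow-up decisions under one write lock.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_slots {
	pthread_rwlock_t lock;
	unsigned long slot[4];
	bool orphaned;          /* stand-in for HANDLES_ORPHANED */
};

static void demo_free_handle(struct demo_slots *slots, unsigned long *handle,
			     bool handle_is_foreign)
{
	bool is_free = true;

	pthread_rwlock_wrlock(&slots->lock);
	*handle = 0;

	if (!handle_is_foreign) {
		/* Simple case: nothing else to do, release and return. */
		pthread_rwlock_unlock(&slots->lock);
		return;
	}

	/* Foreign handle: decide, still write-locked, whether slots is empty. */
	if (!slots->orphaned) {
		pthread_rwlock_unlock(&slots->lock);
		return;
	}
	for (size_t i = 0; i < 4; i++) {
		if (slots->slot[i]) {
			is_free = false;
			break;
		}
	}
	pthread_rwlock_unlock(&slots->lock);

	if (is_free)
		printf("slots page can be freed\n");
}

int main(void)
{
	struct demo_slots slots = { .orphaned = true };

	pthread_rwlock_init(&slots.lock, NULL);
	slots.slot[0] = 0xdeadbeef;
	demo_free_handle(&slots, &slots.slot[0], true);
	pthread_rwlock_destroy(&slots.lock);
	return 0;
}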