diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-12-02 07:36:41 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-12-02 07:36:41 +0300 |
commit | 596cf45cbf6e4fa7bcb0df33e373a7d062b644b5 (patch) | |
tree | 3f24095d65cbdaaae8e89ff02b87df373ada1bfb /mm | |
parent | c3bfc5dd73c6f519ff0636d4e709515f06edef78 (diff) | |
parent | 937790699be9c8100e5358625e7dfa8b32bd33f2 (diff) | |
download | linux-596cf45cbf6e4fa7bcb0df33e373a7d062b644b5.tar.xz |
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
"Incoming:
- a small number of updates to scripts/, ocfs2 and fs/buffer.c
- most of MM
I still have quite a lot of material (mostly not MM) staged after
linux-next due to -next dependencies. I'll send those across next week
as the preprequisites get merged up"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (135 commits)
mm/page_io.c: annotate refault stalls from swap_readpage
mm/Kconfig: fix trivial help text punctuation
mm/Kconfig: fix indentation
mm/memory_hotplug.c: remove __online_page_set_limits()
mm: fix typos in comments when calling __SetPageUptodate()
mm: fix struct member name in function comments
mm/shmem.c: cast the type of unmap_start to u64
mm: shmem: use proper gfp flags for shmem_writepage()
mm/shmem.c: make array 'values' static const, makes object smaller
userfaultfd: require CAP_SYS_PTRACE for UFFD_FEATURE_EVENT_FORK
fs/userfaultfd.c: wp: clear VM_UFFD_MISSING or VM_UFFD_WP during userfaultfd_register()
userfaultfd: wrap the common dst_vma check into an inlined function
userfaultfd: remove unnecessary WARN_ON() in __mcopy_atomic_hugetlb()
userfaultfd: use vma_pagesize for all huge page size calculation
mm/madvise.c: use PAGE_ALIGN[ED] for range checking
mm/madvise.c: replace with page_size() in madvise_inject_error()
mm/mmap.c: make vma_merge() comment more easy to understand
mm/hwpoison-inject: use DEFINE_DEBUGFS_ATTRIBUTE to define debugfs fops
autonuma: reduce cache footprint when scanning page tables
autonuma: fix watermark checking in migrate_balanced_pgdat()
...
Diffstat (limited to 'mm')
44 files changed, 1954 insertions, 1273 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f332efe751dd..ab80933be65f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -29,7 +29,7 @@ config FLATMEM_MANUAL For systems that have holes in their physical address spaces and for features like NUMA and memory hotplug, - choose "Sparse Memory" + choose "Sparse Memory". If unsure, choose this option (Flat Memory) over any other. @@ -122,9 +122,9 @@ config SPARSEMEM_VMEMMAP depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE default y help - SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise - pfn_to_page and page_to_pfn operations. This is the most - efficient option when sufficient kernel resources are available. + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. config HAVE_MEMBLOCK_NODE_MAP bool @@ -160,9 +160,9 @@ config MEMORY_HOTPLUG_SPARSE depends on SPARSEMEM && MEMORY_HOTPLUG config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG - help + bool "Online the newly added memory blocks by default" + depends on MEMORY_HOTPLUG + help This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which determines what happens to newly added memory regions. Policy setting @@ -227,14 +227,14 @@ config COMPACTION select MIGRATION depends on MMU help - Compaction is the only memory management component to form - high order (larger physically contiguous) memory blocks - reliably. The page allocator relies on compaction heavily and - the lack of the feature can lead to unexpected OOM killer - invocations for high order memory requests. You shouldn't - disable this option unless there really is a strong reason for - it and then we would be really interested to hear about that at - linux-mm@kvack.org. + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. # # support for page migration @@ -258,7 +258,7 @@ config ARCH_ENABLE_THP_MIGRATION bool config CONTIG_ALLOC - def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA config PHYS_ADDR_T_64BIT def_bool 64BIT @@ -302,10 +302,10 @@ config KSM root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" + int "Low address space to protect from user allocation" depends on MMU - default 4096 - help + default 4096 + help This is the portion of low virtual memory which should be protected from userspace allocation. Keeping a user from writing to low pages can help reduce the impact of kernel NULL pointer bugs. @@ -408,7 +408,7 @@ choice endchoice config ARCH_WANTS_THP_SWAP - def_bool n + def_bool n config THP_SWAP def_bool y @@ -95,13 +95,11 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, static int __init cma_activate_area(struct cma *cma) { - int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; unsigned i = cma->count >> pageblock_order; struct zone *zone; - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - + cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) { cma->count = 0; return -ENOMEM; @@ -139,7 +137,7 @@ static int __init cma_activate_area(struct cma *cma) not_in_zone: pr_err("CMA area %s could not be activated\n", cma->name); - kfree(cma->bitmap); + bitmap_free(cma->bitmap); cma->count = 0; return -EINVAL; } diff --git a/mm/cma_debug.c b/mm/cma_debug.c index a7dd9e8e10d5..4e6cbe2f586e 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -29,7 +29,7 @@ static int cma_debugfs_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); static int cma_used_get(void *data, u64 *val) { @@ -44,7 +44,7 @@ static int cma_used_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); static int cma_maxchunk_get(void *data, u64 *val) { @@ -66,7 +66,7 @@ static int cma_maxchunk_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) { @@ -126,7 +126,7 @@ static int cma_free_write(void *data, u64 val) return cma_free_mem(cma, pages); } -DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); static int cma_alloc_mem(struct cma *cma, int count) { @@ -158,7 +158,7 @@ static int cma_alloc_write(void *data, u64 val) return cma_alloc_mem(cma, pages); } -DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { diff --git a/mm/filemap.c b/mm/filemap.c index 85b7d087eb45..bf6aa30be58d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) -static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, - struct file *fpin) -{ - int flags = vmf->flags; - - if (fpin) - return fpin; - - /* - * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or - * anything, so we only pin the file and drop the mmap_sem if only - * FAULT_FLAG_ALLOW_RETRY is set. - */ - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == - FAULT_FLAG_ALLOW_RETRY) { - fpin = get_file(vmf->vma->vm_file); - up_read(&vmf->vma->vm_mm->mmap_sem); - } - return fpin; -} - /* * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem * @vmf - the vm_fault for this fault. @@ -3161,6 +3140,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(pagecache_write_end); +/* + * Warn about a page cache invalidation failure during a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { @@ -3218,11 +3218,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break - * them by removing it completely + * them by removing it completely. + * + * Noticeable example is a blkdev_direct_IO(). + * + * Skip invalidation for async writes or if mapping has no pages. */ - if (mapping->nrpages) - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (written > 0 && mapping->nrpages && + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) + dio_warn_stale_pagecache(file); if (written > 0) { pos += written; @@ -734,11 +734,17 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * Or NULL if the caller does not require them. * @nonblocking: whether waiting for disk IO or mmap_sem contention * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. * * Must be called with mmap_sem held. It may be released. See below. * @@ -1107,11 +1113,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. * * Must be called with mmap_sem held for read or write. * @@ -1443,6 +1455,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); + long ret = nr_pages; check_again: for (i = 0; i < nr_pages;) { @@ -1504,17 +1517,18 @@ check_again: * again migrating any new CMA pages which we failed to isolate * earlier. */ - nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, NULL, gup_flags); - if ((nr_pages > 0) && migrate_allow) { + if ((ret > 0) && migrate_allow) { + nr_pages = ret; drain_allow = true; goto check_again; } } - return nr_pages; + return ret; } #else static long check_and_migrate_cma_pages(struct task_struct *tsk, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 13cc93785006..41a0fbddc96b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3003,7 +3003,7 @@ next: return 0; } -DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, +DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, "%llu\n"); static int __init split_huge_pages_debugfs(void) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b45a95363a84..ac65bb5e38ac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -244,16 +244,66 @@ struct file_region { long to; }; +/* Must be called with resv->lock held. Calling this with count_only == true + * will count the number of pages to be added but will not modify the linked + * list. + */ +static long add_reservation_in_range(struct resv_map *resv, long f, long t, + bool count_only) +{ + long chg = 0; + struct list_head *head = &resv->regions; + struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* We overlap with this area, if it extends further than + * us then we must extend ourselves. Account for its + * existing reservation. + */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + + if (!count_only && rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + + if (!count_only) { + nrg->from = f; + nrg->to = t; + } + + return chg; +} + /* * Add the huge page range represented by [f, t) to the reserve - * map. In the normal case, existing regions will be expanded - * to accommodate the specified range. Sufficient regions should - * exist for expansion due to the previous call to region_chg - * with the same range. However, it is possible that region_del - * could have been called after region_chg and modifed the map - * in such a way that no region exists to be expanded. In this - * case, pull a region descriptor from the cache associated with - * the map and use that for the new range. + * map. Existing regions will be expanded to accommodate the specified + * range, or a region will be taken from the cache. Sufficient regions + * must exist in the cache due to the previous call to region_chg with + * the same range. * * Return the number of new huge pages added to the map. This * number is greater than or equal to zero. @@ -261,7 +311,7 @@ struct file_region { static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg, *trg; + struct file_region *rg, *nrg; long add = 0; spin_lock(&resv->lock); @@ -272,9 +322,8 @@ static long region_add(struct resv_map *resv, long f, long t) /* * If no region exists which can be expanded to include the - * specified range, the list must have been modified by an - * interleving call to region_del(). Pull a region descriptor - * from the cache and use it for this range. + * specified range, pull a region descriptor from the cache + * and use it for this range. */ if (&rg->link == head || t < rg->from) { VM_BUG_ON(resv->region_cache_count <= 0); @@ -292,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t) goto out_locked; } - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - /* Decrement return value by the deleted range. - * Another range will span this area so that by - * end of routine add will be >= zero - */ - add -= (rg->to - rg->from); - list_del(&rg->link); - kfree(rg); - } - } - - add += (nrg->from - f); /* Added to beginning of region */ - nrg->from = f; - add += t - nrg->to; /* Added to end of region */ - nrg->to = t; + add = add_reservation_in_range(resv, f, t, false); out_locked: resv->adds_in_progress--; @@ -339,15 +357,9 @@ out_locked: * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the - * map. However, if the existing regions in the map can not - * be expanded to represent the new range, a new file_region - * structure is added to the map as a placeholder. This is - * so that the subsequent region_add call will have all the - * regions it needs and will not fail. - * - * Upon entry, region_chg will also examine the cache of region descriptors - * associated with the map. If there are not enough descriptors cached, one - * will be allocated for the in progress add operation. + * map. A new file_region structure is added to the cache + * as a placeholder, so that the subsequent region_add + * call will have all the regions it needs and will not fail. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to @@ -356,11 +368,8 @@ out_locked: */ static long region_chg(struct resv_map *resv, long f, long t) { - struct list_head *head = &resv->regions; - struct file_region *rg, *nrg = NULL; long chg = 0; -retry: spin_lock(&resv->lock); retry_locked: resv->adds_in_progress++; @@ -378,10 +387,8 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) { - kfree(nrg); + if (!trg) return -ENOMEM; - } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -389,61 +396,8 @@ retry_locked: goto retry_locked; } - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; + chg = add_reservation_in_range(resv, f, t, true); - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - if (!nrg) { - resv->adds_in_progress--; - spin_unlock(&resv->lock); - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - goto retry; - } - - list_add(&nrg->link, rg->link.prev); - chg = t - f; - goto out_nrg; - } - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. */ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - goto out; - - /* We overlap with this area, if it extends further than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - -out: - spin_unlock(&resv->lock); - /* We already know we raced and no longer need the new region */ - kfree(nrg); - return chg; -out_nrg: spin_unlock(&resv->lock); return chg; } @@ -1069,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order) } #ifdef CONFIG_CONTIG_ALLOC -static int __alloc_gigantic_page(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) -{ - unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); -} - -static bool pfn_range_valid_gigantic(struct zone *z, - unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long i, end_pfn = start_pfn + nr_pages; - struct page *page; - - for (i = start_pfn; i < end_pfn; i++) { - page = pfn_to_online_page(i); - if (!page) - return false; - - if (page_zone(page) != z) - return false; - - if (PageReserved(page)) - return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; - } - - return true; -} - -static bool zone_spans_last_pfn(const struct zone *zone, - unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long last_pfn = start_pfn + nr_pages - 1; - return zone_spans_pfn(zone, last_pfn); -} - static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned int order = huge_page_order(h); - unsigned long nr_pages = 1 << order; - unsigned long ret, pfn, flags; - struct zonelist *zonelist; - struct zone *zone; - struct zoneref *z; + unsigned long nr_pages = 1UL << huge_page_order(h); - zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { - spin_lock_irqsave(&zone->lock, flags); - - pfn = ALIGN(zone->zone_start_pfn, nr_pages); - while (zone_spans_last_pfn(zone, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { - /* - * We release the zone lock here because - * alloc_contig_range() will also lock the zone - * at some point. If there's an allocation - * spinning on this lock, it may win the race - * and cause alloc_contig_range() to fail... - */ - spin_unlock_irqrestore(&zone->lock, flags); - ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); - if (!ret) - return pfn_to_page(pfn); - spin_lock_irqsave(&zone->lock, flags); - } - pfn += nr_pages; - } - - spin_unlock_irqrestore(&zone->lock, flags); - } - - return NULL; + return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); @@ -3915,7 +3796,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4042,8 +3923,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; @@ -4051,7 +3931,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, key[0] = (unsigned long) mapping; key[1] = idx; - hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } @@ -4060,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } @@ -4105,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -4459,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } } + + /* + * If subpage information not requested, update counters + * and skip the same_page loop below. + */ + if (!pages && !vmas && !pfn_offset && + (vaddr + huge_page_size(h) < vma->vm_end) && + (remainder >= pages_per_huge_page(h))) { + vaddr += huge_page_size(h); + remainder -= pages_per_huge_page(h); + i += pages_per_huge_page(h); + spin_unlock(ptl); + continue; + } + same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); @@ -4842,7 +4736,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4872,7 +4766,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); return pte; } diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 5b7430bd83a6..e488876b168a 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -67,8 +67,8 @@ static int hwpoison_unpoison(void *data, u64 val) return unpoison_memory(val); } -DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); -DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); +DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { diff --git a/mm/internal.h b/mm/internal.h index 0d5f720c75ab..3cf20ab3ca01 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -165,6 +165,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void zone_pcp_update(struct zone *zone); +extern void zone_pcp_reset(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -290,7 +293,8 @@ static inline bool is_data_mapping(vm_flags_t flags) /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent); + struct vm_area_struct *prev); +void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU extern long populate_vma_page_range(struct vm_area_struct *vma, @@ -362,6 +366,27 @@ vma_address(struct page *page, struct vm_area_struct *vma) return max(start, vma->vm_start); } +static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) +{ + int flags = vmf->flags; + + if (fpin) + return fpin; + + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. + */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} + #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6814d6d6a023..df3371d5c572 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,6 +36,8 @@ #include <linux/bug.h> #include <linux/uaccess.h> +#include <asm/tlbflush.h> + #include "kasan.h" #include "../slab.h" @@ -590,6 +592,7 @@ void kasan_kfree_large(void *ptr, unsigned long ip) /* The object will be poisoned by page_alloc. */ } +#ifndef CONFIG_KASAN_VMALLOC int kasan_module_alloc(void *addr, size_t size) { void *ret; @@ -625,6 +628,7 @@ void kasan_free_shadow(const struct vm_struct *vm) if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); } +#endif extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); @@ -744,3 +748,232 @@ static int __init kasan_memhotplug_init(void) core_initcall(kasan_memhotplug_init); #endif + +#ifdef CONFIG_KASAN_VMALLOC +static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + pte_t pte; + + if (likely(!pte_none(*ptep))) + return 0; + + page = __get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + + spin_lock(&init_mm.page_table_lock); + if (likely(pte_none(*ptep))) { + set_pte_at(&init_mm, addr, ptep, pte); + page = 0; + } + spin_unlock(&init_mm.page_table_lock); + if (page) + free_page(page); + return 0; +} + +int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area) +{ + unsigned long shadow_start, shadow_end; + int ret; + + shadow_start = (unsigned long)kasan_mem_to_shadow(area->addr); + shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow(area->addr + + area->size); + shadow_end = ALIGN(shadow_end, PAGE_SIZE); + + ret = apply_to_page_range(&init_mm, shadow_start, + shadow_end - shadow_start, + kasan_populate_vmalloc_pte, NULL); + if (ret) + return ret; + + flush_cache_vmap(shadow_start, shadow_end); + + kasan_unpoison_shadow(area->addr, requested_size); + + area->flags |= VM_KASAN; + + /* + * We need to be careful about inter-cpu effects here. Consider: + * + * CPU#0 CPU#1 + * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ; + * p[99] = 1; + * + * With compiler instrumentation, that ends up looking like this: + * + * CPU#0 CPU#1 + * // vmalloc() allocates memory + * // let a = area->addr + * // we reach kasan_populate_vmalloc + * // and call kasan_unpoison_shadow: + * STORE shadow(a), unpoison_val + * ... + * STORE shadow(a+99), unpoison_val x = LOAD p + * // rest of vmalloc process <data dependency> + * STORE p, a LOAD shadow(x+99) + * + * If there is no barrier between the end of unpoisioning the shadow + * and the store of the result to p, the stores could be committed + * in a different order by CPU#0, and CPU#1 could erroneously observe + * poison in the shadow. + * + * We need some sort of barrier between the stores. + * + * In the vmalloc() case, this is provided by a smp_wmb() in + * clear_vm_uninitialized_flag(). In the per-cpu allocator and in + * get_vm_area() and friends, the caller gets shadow allocated but + * doesn't have any pages mapped into the virtual address space that + * has been reserved. Mapping those pages in will involve taking and + * releasing a page-table lock, which will provide the barrier. + */ + + return 0; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void kasan_poison_vmalloc(void *start, unsigned long size) +{ + size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); +} + +static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + + page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + + spin_lock(&init_mm.page_table_lock); + + if (likely(!pte_none(*ptep))) { + pte_clear(&init_mm, addr, ptep); + free_page(page); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} + +/* + * Release the backing for the vmalloc region [start, end), which + * lies within the free region [free_region_start, free_region_end). + * + * This can be run lazily, long after the region was freed. It runs + * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap + * infrastructure. + * + * How does this work? + * ------------------- + * + * We have a region that is page aligned, labelled as A. + * That might not map onto the shadow in a way that is page-aligned: + * + * start end + * v v + * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |??AAAAAA|AAAAAAAA|AA??????| < shadow + * (1) (2) (3) + * + * First we align the start upwards and the end downwards, so that the + * shadow of the region aligns with shadow page boundaries. In the + * example, this gives us the shadow page (2). This is the shadow entirely + * covered by this allocation. + * + * Then we have the tricky bits. We want to know if we can free the + * partially covered shadow pages - (1) and (3) in the example. For this, + * we are given the start and end of the free region that contains this + * allocation. Extending our previous example, we could have: + * + * free_region_start free_region_end + * | start end | + * v v v v + * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow + * (1) (2) (3) + * + * Once again, we align the start of the free region up, and the end of + * the free region down so that the shadow is page aligned. So we can free + * page (1) - we know no allocation currently uses anything in that page, + * because all of it is in the vmalloc free region. But we cannot free + * page (3), because we can't be sure that the rest of it is unused. + * + * We only consider pages that contain part of the original region for + * freeing: we don't try to free other pages from the free region or we'd + * end up trying to free huge chunks of virtual address space. + * + * Concurrency + * ----------- + * + * How do we know that we're not freeing a page that is simultaneously + * being used for a fresh allocation in kasan_populate_vmalloc(_pte)? + * + * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running + * at the same time. While we run under free_vmap_area_lock, the population + * code does not. + * + * free_vmap_area_lock instead operates to ensure that the larger range + * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and + * the per-cpu region-finding algorithm both run under free_vmap_area_lock, + * no space identified as free will become used while we are running. This + * means that so long as we are careful with alignment and only free shadow + * pages entirely covered by the free region, we will not run in to any + * trouble - any simultaneous allocations will be for disjoint regions. + */ +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) +{ + void *shadow_start, *shadow_end; + unsigned long region_start, region_end; + + region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + + free_region_start = ALIGN(free_region_start, + PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + + if (start != region_start && + free_region_start < region_start) + region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE; + + free_region_end = ALIGN_DOWN(free_region_end, + PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + + if (end != region_end && + free_region_end > region_end) + region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE; + + shadow_start = kasan_mem_to_shadow((void *)region_start); + shadow_end = kasan_mem_to_shadow((void *)region_end); + + if (shadow_end > shadow_start) { + apply_to_page_range(&init_mm, (unsigned long)shadow_start, + (unsigned long)(shadow_end - shadow_start), + kasan_depopulate_vmalloc_pte, NULL); + flush_tlb_kernel_range((unsigned long)shadow_start, + (unsigned long)shadow_end); + } +} +#endif diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c index 36c645939bc9..2d97efd4954f 100644 --- a/mm/kasan/generic_report.c +++ b/mm/kasan/generic_report.c @@ -86,6 +86,9 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) case KASAN_ALLOCA_RIGHT: bug_type = "alloca-out-of-bounds"; break; + case KASAN_VMALLOC_INVALID: + bug_type = "vmalloc-out-of-bounds"; + break; } return bug_type; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 35cff6bbb716..3a083274628e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -25,6 +25,7 @@ #endif #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ +#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */ /* * Stack redzone shadow values diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a8a57bebb5fa..b679908743cb 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1602,6 +1602,24 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_unlocked; } + } else if (PageDirty(page)) { + /* + * khugepaged only works on read-only fd, + * so this page is dirty because it hasn't + * been flushed since first write. There + * won't be new dirty pages. + * + * Trigger async flush here and hope the + * writeback is done when khugepaged + * revisits this page. + * + * This is a one-off situation. We are not + * forcing writeback in loop. + */ + xas_unlock_irq(&xas); + filemap_flush(mapping); + result = SCAN_FAIL; + goto xa_unlocked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); diff --git a/mm/madvise.c b/mm/madvise.c index 94c343b4c968..bcdb6a042787 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -864,13 +864,13 @@ static int madvise_inject_error(int behavior, { struct page *page; struct zone *zone; - unsigned int order; + unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += PAGE_SIZE << order) { + for (; start < end; start += size) { unsigned long pfn; int ret; @@ -882,9 +882,9 @@ static int madvise_inject_error(int behavior, /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will - * no longer be a compound page, and order will be 0. + * no longer be a compound page. */ - order = compound_order(compound_head(page)); + size = page_size(compound_head(page)); if (PageHWPoison(page)) { put_page(page); @@ -895,7 +895,7 @@ static int madvise_inject_error(int behavior, pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); - ret = soft_offline_page(page, MF_COUNT_INCREASED); + ret = soft_offline_page(pfn, MF_COUNT_INCREASED); if (ret) return ret; continue; @@ -1059,9 +1059,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (!madvise_behavior_valid(behavior)) return error; - if (start & ~PAGE_MASK) + if (!PAGE_ALIGNED(start)) return error; - len = (len_in + ~PAGE_MASK) & PAGE_MASK; + len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) diff --git a/mm/memblock.c b/mm/memblock.c index c4b16cae2bc9..4bc2c7d8bf42 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -57,42 +57,38 @@ * at build time. The region arrays for the "memory" and "reserved" * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the * "physmap" type to %INIT_PHYSMEM_REGIONS. - * The :c:func:`memblock_allow_resize` enables automatic resizing of - * the region arrays during addition of new regions. This feature - * should be used with care so that memory allocated for the region - * array will not overlap with areas that should be reserved, for - * example initrd. + * The memblock_allow_resize() enables automatic resizing of the region + * arrays during addition of new regions. This feature should be used + * with care so that memory allocated for the region array will not + * overlap with areas that should be reserved, for example initrd. * * The early architecture setup should tell memblock what the physical - * memory layout is by using :c:func:`memblock_add` or - * :c:func:`memblock_add_node` functions. The first function does not - * assign the region to a NUMA node and it is appropriate for UMA - * systems. Yet, it is possible to use it on NUMA systems as well and - * assign the region to a NUMA node later in the setup process using - * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` - * performs such an assignment directly. + * memory layout is by using memblock_add() or memblock_add_node() + * functions. The first function does not assign the region to a NUMA + * node and it is appropriate for UMA systems. Yet, it is possible to + * use it on NUMA systems as well and assign the region to a NUMA node + * later in the setup process using memblock_set_node(). The + * memblock_add_node() performs such an assignment directly. * * Once memblock is setup the memory can be allocated using one of the * API variants: * - * * :c:func:`memblock_phys_alloc*` - these functions return the - * **physical** address of the allocated memory - * * :c:func:`memblock_alloc*` - these functions return the **virtual** - * address of the allocated memory. + * * memblock_phys_alloc*() - these functions return the **physical** + * address of the allocated memory + * * memblock_alloc*() - these functions return the **virtual** address + * of the allocated memory. * * Note, that both API variants use implict assumptions about allowed * memory ranges and the fallback methods. Consult the documentation - * of :c:func:`memblock_alloc_internal` and - * :c:func:`memblock_alloc_range_nid` functions for more elaboarte - * description. + * of memblock_alloc_internal() and memblock_alloc_range_nid() + * functions for more elaborate description. * - * As the system boot progresses, the architecture specific - * :c:func:`mem_init` function frees all the memory to the buddy page - * allocator. + * As the system boot progresses, the architecture specific mem_init() + * function frees all the memory to the buddy page allocator. * - * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the + * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system - * initialization compltes. + * initialization completes. */ #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -1323,12 +1319,13 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * @start: the lower bound of the memory region to allocate (phys address) * @end: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes * * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. + * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. * - * If the specified node can not hold the requested memory the - * allocation falls back to any node in the system + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. * * For systems with memory mirroring, the allocation is attempted first * from the regions with mirroring enabled and then retried from any @@ -1342,7 +1339,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, + bool exact_nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; @@ -1362,7 +1360,7 @@ again: if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE) { + if (nid != NUMA_NO_NODE && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1410,7 +1408,8 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t start, phys_addr_t end) { - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + false); } /** @@ -1429,7 +1428,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_range_nid(size, align, 0, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid, false); } /** @@ -1439,6 +1438,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * @min_addr: the lower bound of the memory region to allocate (phys address) * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes * * Allocates memory block using memblock_alloc_range_nid() and * converts the returned physical address to virtual. @@ -1454,7 +1454,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, - int nid) + int nid, bool exact_nid) { phys_addr_t alloc; @@ -1469,11 +1469,13 @@ static void * __init memblock_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid, + exact_nid); /* retry allocation without lower limit */ if (!alloc && min_addr) - alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid); + alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid, + exact_nid); if (!alloc) return NULL; @@ -1482,6 +1484,43 @@ static void * __init memblock_alloc_internal( } /** + * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node + * without zeroing memory + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory. + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_alloc_exact_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + ptr = memblock_alloc_internal(size, align, + min_addr, max_addr, nid, true); + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + +/** * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking * @size: size of memory block to be allocated in bytes @@ -1512,7 +1551,7 @@ void * __init memblock_alloc_try_nid_raw( &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); + min_addr, max_addr, nid, false); if (ptr && size > 0) page_init_poison(ptr, size); @@ -1547,7 +1586,7 @@ void * __init memblock_alloc_try_nid( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); + min_addr, max_addr, nid, false); if (ptr) memset(ptr, 0, size); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01f3f8b665e9..bc01423277c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -108,7 +108,6 @@ static const char *const mem_cgroup_lru_names[] = { #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 -#define NUMAINFO_EVENTS_TARGET 1024 /* * Cgroups above their limits are maintained in a RB-Tree, independent of @@ -778,7 +777,7 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) if (!memcg || memcg == root_mem_cgroup) { __mod_node_page_state(pgdat, idx, val); } else { - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } rcu_read_unlock(); @@ -877,9 +876,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_SOFTLIMIT: next = val + SOFTLIMIT_EVENTS_TARGET; break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; default: break; } @@ -899,21 +895,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { bool do_softlimit; - bool do_numainfo __maybe_unused; do_softlimit = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); -#if MAX_NUMNODES > 1 - do_numainfo = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_NUMAINFO); -#endif mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); -#if MAX_NUMNODES > 1 - if (unlikely(do_numainfo)) - atomic_inc(&memcg->numainfo_events); -#endif } } @@ -1052,7 +1039,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_per_node *mz; mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); - iter = &mz->iter[reclaim->priority]; + iter = &mz->iter; if (prev && reclaim->generation != iter->generation) goto out_unlock; @@ -1152,15 +1139,11 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from, struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_node *mz; int nid; - int i; for_each_node(nid) { mz = mem_cgroup_nodeinfo(from, nid); - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + iter = &mz->iter; + cmpxchg(&iter->position, dead_memcg, NULL); } } @@ -1238,7 +1221,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &pgdat->lruvec; + lruvec = &pgdat->__lruvec; goto out; } @@ -1595,104 +1578,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, return ret; } -#if MAX_NUMNODES > 1 - -/** - * test_mem_cgroup_node_reclaimable - * @memcg: the target memcg - * @nid: the node ID to be checked. - * @noswap : specify true here if the user wants flle only information. - * - * This function returns whether the specified memcg contains any - * reclaimable pages on a node. Returns true if there are any reclaimable - * pages in the node. - */ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, - int nid, bool noswap) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - - if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || - lruvec_page_state(lruvec, NR_ACTIVE_FILE)) - return true; - if (noswap || !total_swap_pages) - return false; - if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || - lruvec_page_state(lruvec, NR_ACTIVE_ANON)) - return true; - return false; - -} - -/* - * Always updating the nodemask is not very good - even if we have an empty - * list or the wrong list here, we can start from some node and traverse all - * nodes based on the zonelist. So update the list loosely once per 10 secs. - * - */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) -{ - int nid; - /* - * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET - * pagein/pageout changes since the last update. - */ - if (!atomic_read(&memcg->numainfo_events)) - return; - if (atomic_inc_return(&memcg->numainfo_updating) > 1) - return; - - /* make a nodemask where this memcg uses memory from */ - memcg->scan_nodes = node_states[N_MEMORY]; - - for_each_node_mask(nid, node_states[N_MEMORY]) { - - if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) - node_clear(nid, memcg->scan_nodes); - } - - atomic_set(&memcg->numainfo_events, 0); - atomic_set(&memcg->numainfo_updating, 0); -} - -/* - * Selecting a node where we start reclaim from. Because what we need is just - * reducing usage counter, start from anywhere is O,K. Considering - * memory reclaim from current node, there are pros. and cons. - * - * Freeing memory from current node means freeing memory from a node which - * we'll use or we've used. So, it may make LRU bad. And if several threads - * hit limits, it will see a contention on a node. But freeing from remote - * node means more costs for memory reclaim because of memory latency. - * - * Now, we use round-robin. Better algorithm is welcomed. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) -{ - int node; - - mem_cgroup_may_update_nodemask(memcg); - node = memcg->last_scanned_node; - - node = next_node_in(node, memcg->scan_nodes); - /* - * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages - * last time it really checked all the LRUs due to rate limiting. - * Fallback to the current node in that case for simplicity. - */ - if (unlikely(node == MAX_NUMNODES)) - node = numa_node_id(); - - memcg->last_scanned_node = node; - return node; -} -#else -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) -{ - return 0; -} -#endif - static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, pg_data_t *pgdat, gfp_t gfp_mask, @@ -1705,7 +1590,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, unsigned long nr_scanned; struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat, - .priority = 0, }; excess = soft_limit_excess(root_memcg); @@ -3750,7 +3634,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; enum lru_list lru; @@ -5078,7 +4962,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; INIT_WORK(&memcg->high_work, high_work_func); - memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); @@ -5455,8 +5338,8 @@ static int mem_cgroup_move_account(struct page *page, anon = PageAnon(page); pgdat = page_pgdat(page); - from_vec = mem_cgroup_lruvec(pgdat, from); - to_vec = mem_cgroup_lruvec(pgdat, to); + from_vec = mem_cgroup_lruvec(from, pgdat); + to_vec = mem_cgroup_lruvec(to, pgdat); spin_lock_irqsave(&from->move_lock, flags); @@ -6096,7 +5979,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; + unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long high; int err; @@ -6107,12 +5991,29 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; - nr_pages = page_counter_read(&memcg->memory); - if (nr_pages > high) - try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long reclaimed; + + if (nr_pages <= high) + break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + + if (!reclaimed && !nr_retries--) + break; + } - memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -6144,10 +6045,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_pages <= max) break; - if (signal_pending(current)) { - err = -EINTR; + if (signal_pending(current)) break; - } if (!drained) { drain_all_stock(memcg); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3151c87dff73..41c634f45d45 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -303,30 +303,24 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * TBD would GFP_NOIO be enough? */ static void add_to_kill(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, - struct list_head *to_kill, - struct to_kill **tkc) + struct list_head *to_kill) { struct to_kill *tk; - if (*tkc) { - tk = *tkc; - *tkc = NULL; - } else { - tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); - if (!tk) { - pr_err("Memory failure: Out of memory while machine check handling\n"); - return; - } + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + pr_err("Memory failure: Out of memory while machine check handling\n"); + return; } + tk->addr = page_address_in_vma(p, vma); if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(p, vma); else - tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT; + tk->size_shift = page_shift(compound_head(p)); /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -345,6 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, kfree(tk); return; } + get_task_struct(tsk); tk->tsk = tsk; list_add_tail(&tk->nd, to_kill); @@ -436,7 +431,7 @@ static struct task_struct *task_early_kill(struct task_struct *tsk, * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc, int force_early) + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -461,7 +456,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill, tkc); + add_to_kill(t, page, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -472,7 +467,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc, int force_early) + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -496,7 +491,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. */ if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill, tkc); + add_to_kill(t, page, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -505,26 +500,17 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, /* * Collect the processes who have the corrupted page mapped to kill. - * This is done in two steps for locking reasons. - * First preallocate one tokill structure outside the spin locks, - * so that we can kill at least one process reasonably reliable. */ static void collect_procs(struct page *page, struct list_head *tokill, int force_early) { - struct to_kill *tk; - if (!page->mapping) return; - tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); - if (!tk) - return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk, force_early); + collect_procs_anon(page, tokill, force_early); else - collect_procs_file(page, tokill, &tk, force_early); - kfree(tk); + collect_procs_file(page, tokill, force_early); } static const char *action_name[] = { @@ -1490,7 +1476,7 @@ static void memory_failure_work_func(struct work_struct *work) if (!gotten) break; if (entry.flags & MF_SOFT_OFFLINE) - soft_offline_page(pfn_to_page(entry.pfn), entry.flags); + soft_offline_page(entry.pfn, entry.flags); else memory_failure(entry.pfn, entry.flags); } @@ -1871,7 +1857,7 @@ static int soft_offline_free_page(struct page *page) /** * soft_offline_page - Soft offline a page. - * @page: page to offline + * @pfn: pfn to soft-offline * @flags: flags. Same as memory_failure(). * * Returns 0 on success, otherwise negated errno. @@ -1891,18 +1877,17 @@ static int soft_offline_free_page(struct page *page) * This is not a 100% solution for all memory, but tries to be * ``good enough'' for the majority of memory. */ -int soft_offline_page(struct page *page, int flags) +int soft_offline_page(unsigned long pfn, int flags) { int ret; - unsigned long pfn = page_to_pfn(page); + struct page *page; - if (is_zone_device_page(page)) { - pr_debug_ratelimited("soft_offline: %#lx page is device page\n", - pfn); - if (flags & MF_COUNT_INCREASED) - put_page(page); + if (!pfn_valid(pfn)) + return -ENXIO; + /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ + page = pfn_to_online_page(pfn); + if (!page) return -EIO; - } if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); diff --git a/mm/memory.c b/mm/memory.c index b6a5d6a08438..513c3ecc76ee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -72,6 +72,8 @@ #include <linux/oom.h> #include <linux/numa.h> +#include <trace/events/kmem.h> + #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> @@ -152,6 +154,10 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) +{ + trace_rss_stat(mm, member, count); +} #if defined(SPLIT_RSS_COUNTING) @@ -2289,10 +2295,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) * * The function expects the page to be locked and unlocks it. */ -static void fault_dirty_shared_page(struct vm_area_struct *vma, - struct page *page) +static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; + struct page *page = vmf->page; bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; @@ -2307,16 +2314,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma, mapping = page_rmapping(page); unlock_page(page); + if (!page_mkwrite) + file_update_time(vma->vm_file); + + /* + * Throttle page dirtying rate down to writeback speed. + * + * mapping may be NULL here because some device drivers do not + * set page.mapping but still dirty their pages + * + * Drop the mmap_sem before waiting on IO, if we can. The file + * is pinning the mapping, as per above. + */ if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ + struct file *fpin; + + fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); + if (fpin) { + fput(fpin); + return VM_FAULT_RETRY; + } } - if (!page_mkwrite) - file_update_time(vma->vm_file); + return 0; } /* @@ -2571,6 +2592,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = VM_FAULT_WRITE; get_page(vmf->page); @@ -2594,10 +2616,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) wp_page_reuse(vmf); lock_page(vmf->page); } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); put_page(vmf->page); - return VM_FAULT_WRITE; + return ret; } /* @@ -3083,7 +3105,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* * The memory barrier inside __SetPageUptodate makes sure that - * preceeding stores to the page contents become visible before + * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); @@ -3641,7 +3663,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) return ret; } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); return ret; } @@ -3988,6 +4010,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; +retry_pud: if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) @@ -4014,6 +4037,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; + + /* Huge pud page fault raced with pmd_alloc? */ + if (pud_trans_unstable(vmf.pud)) + goto retry_pud; + if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f307bd82d750..55ac23ef11c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -49,8 +49,6 @@ * and restore_online_page_callback() for generic callback restore. */ -static void generic_online_page(struct page *page, unsigned int order); - static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -278,6 +276,22 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, return 0; } +static int check_hotplug_memory_addressable(unsigned long pfn, + unsigned long nr_pages) +{ + const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1; + + if (max_addr >> MAX_PHYSMEM_BITS) { + const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1; + WARN(1, + "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n", + (u64)PFN_PHYS(pfn), max_addr, max_allowed); + return -E2BIG; + } + + return 0; +} + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will @@ -291,6 +305,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, unsigned long nr, start_sec, end_sec; struct vmem_altmap *altmap = restrictions->altmap; + err = check_hotplug_memory_addressable(pfn, nr_pages); + if (err) + return err; + if (altmap) { /* * Validate altmap is within bounds of the total request @@ -580,24 +598,7 @@ int restore_online_page_callback(online_page_callback_t callback) } EXPORT_SYMBOL_GPL(restore_online_page_callback); -void __online_page_set_limits(struct page *page) -{ -} -EXPORT_SYMBOL_GPL(__online_page_set_limits); - -void __online_page_increment_counters(struct page *page) -{ - adjust_managed_page_count(page, 1); -} -EXPORT_SYMBOL_GPL(__online_page_increment_counters); - -void __online_page_free(struct page *page) -{ - __free_reserved_page(page); -} -EXPORT_SYMBOL_GPL(__online_page_free); - -static void generic_online_page(struct page *page, unsigned int order) +void generic_online_page(struct page *page, unsigned int order) { kernel_map_pages(page, 1 << order, 1); __free_pages_core(page, order); @@ -607,6 +608,7 @@ static void generic_online_page(struct page *page, unsigned int order) totalhigh_pages_add(1UL << order); #endif } +EXPORT_SYMBOL_GPL(generic_online_page); static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) @@ -1180,7 +1182,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, + MEMORY_OFFLINE); } /* Checks if this range of memory is likely to be hot-removable. */ @@ -1377,9 +1380,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) return ret; } -/* - * remove from free_area[] and mark all as Reserved. - */ +/* Mark all sections offline and remove all free pages from the buddy. */ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) @@ -1397,7 +1398,8 @@ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { - return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); + return test_pages_isolated(start_pfn, start_pfn + nr_pages, + MEMORY_OFFLINE); } static int __init cmdline_parse_movable_node(char *p) @@ -1478,10 +1480,19 @@ static void node_states_clear_node(int node, struct memory_notify *arg) node_clear_state(node, N_MEMORY); } +static int count_system_ram_pages_cb(unsigned long start_pfn, + unsigned long nr_pages, void *data) +{ + unsigned long *nr_system_ram_pages = data; + + *nr_system_ram_pages += nr_pages; + return 0; +} + static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, nr_pages; + unsigned long pfn, nr_pages = 0; unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; @@ -1492,6 +1503,22 @@ static int __ref __offline_pages(unsigned long start_pfn, mem_hotplug_begin(); + /* + * Don't allow to offline memory blocks that contain holes. + * Consequently, memory blocks with holes can never get onlined + * via the hotplug path - online_pages() - as hotplugged memory has + * no holes. This way, we e.g., don't have to worry about marking + * memory holes PG_reserved, don't need pfn_valid() checks, and can + * avoid using walk_system_ram_range() later. + */ + walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages, + count_system_ram_pages_cb); + if (nr_pages != end_pfn - start_pfn) { + ret = -EINVAL; + reason = "memory holes"; + goto failed_removal; + } + /* This makes hotplug much easier...and readable. we assume this for now. .*/ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, @@ -1503,12 +1530,11 @@ static int __ref __offline_pages(unsigned long start_pfn, zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); - nr_pages = end_pfn - start_pfn; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - SKIP_HWPOISON | REPORT_FAILURE); + MEMORY_OFFLINE | REPORT_FAILURE); if (ret < 0) { reason = "failure to isolate range"; goto failed_removal; @@ -1750,13 +1776,13 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); - memblock_free(start, size); - memblock_remove(start, size); /* remove memory block devices before removing memory */ remove_memory_block_devices(start, size); arch_remove_memory(nid, start, size, NULL); + memblock_free(start, size); + memblock_remove(start, size); __release_memory_resource(start, size); try_offline_node(nid); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08c94170ae4..067cf7d3daf5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -410,7 +410,9 @@ struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; - struct vm_area_struct *prev; + unsigned long start; + unsigned long end; + struct vm_area_struct *first; }; /* @@ -618,6 +620,22 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; + /* range check first */ + VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end)); + + if (!qp->first) { + qp->first = vma; + if (!(flags & MPOL_MF_DISCONTIG_OK) && + (qp->start < vma->vm_start)) + /* hole at head side of range */ + return -EFAULT; + } + if (!(flags & MPOL_MF_DISCONTIG_OK) && + ((vma->vm_end < qp->end) && + (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + /* hole at middle or tail of range */ + return -EFAULT; + /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable @@ -628,17 +646,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (endvma > end) endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return -EFAULT; - if (qp->prev && qp->prev->vm_end < vma->vm_start) - return -EFAULT; - } - - qp->prev = vma; if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ @@ -681,14 +688,23 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { + int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, - .prev = NULL, + .start = start, + .end = end, + .first = NULL, }; - return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + + if (!qp.first) + /* whole range in hole */ + err = -EFAULT; + + return err; } /* @@ -740,8 +756,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmend; vma = find_vma(mm, start); - if (!vma || vma->vm_start > start) - return -EFAULT; + VM_BUG_ON(!vma); prev = vma->vm_prev; if (start > vma->vm_start) diff --git a/mm/migrate.c b/mm/migrate.c index 4fe45d1428c8..eae1565285e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1168,15 +1168,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - struct page *newpage; + struct page *newpage = NULL; if (!thp_migration_supported() && PageTransHuge(page)) return -ENOMEM; - newpage = get_new_page(page, private); - if (!newpage) - return -ENOMEM; - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ ClearPageActive(page); @@ -1187,13 +1183,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, __ClearPageIsolated(page); unlock_page(page); } - if (put_new_page) - put_new_page(newpage, private); - else - put_page(newpage); goto out; } + newpage = get_new_page(page, private); + if (!newpage) + return -ENOMEM; + rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1863,7 +1859,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, - 0, 0)) + ZONE_MOVABLE, 0)) continue; return true; } diff --git a/mm/mmap.c b/mm/mmap.c index a7d8c84d19b7..9c648524e4dc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -641,7 +641,7 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, struct rb_node *rb_parent) { - __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_list(mm, vma, prev); __vma_link_rb(mm, vma, rb_link, rb_parent); } @@ -684,37 +684,14 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) static __always_inline void __vma_unlink_common(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, - bool has_prev, struct vm_area_struct *ignore) { - struct vm_area_struct *next; - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - next = vma->vm_next; - if (has_prev) - prev->vm_next = next; - else { - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - } - if (next) - next->vm_prev = prev; - + __vma_unlink_list(mm, vma); /* Kill the cache */ vmacache_invalidate(mm); } -static inline void __vma_unlink_prev(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - __vma_unlink_common(mm, vma, prev, true, vma); -} - /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -769,8 +746,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, remove_next = 1 + (end > next->vm_end); VM_WARN_ON(remove_next == 2 && end != next->vm_next->vm_end); - VM_WARN_ON(remove_next == 1 && - end != next->vm_end); /* trim end to next, for case 6 first pass */ end = next->vm_end; } @@ -889,7 +864,7 @@ again: * us to remove next before dropping the locks. */ if (remove_next != 3) - __vma_unlink_prev(mm, next, vma); + __vma_unlink_common(mm, next, next); else /* * vma is not before next if they've been @@ -900,7 +875,7 @@ again: * "next" (which is stored in post-swap() * "vma"). */ - __vma_unlink_common(mm, next, NULL, false, vma); + __vma_unlink_common(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -1116,15 +1091,18 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * the area passed down from mprotect_fixup, never extending beyond one * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: * - * AAAA AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX - * cannot merge might become might become might become - * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or - * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPXXXXXXXX 8 - * AAAA - * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN - * might become case 1 below case 2 below case 3 below + * AAAA AAAA AAAA + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN + * cannot merge might become might become + * PPNNNNNNNNNN PPPPPPPPPPNN + * mmap, brk or case 4 below case 5 below + * mremap move: + * AAAA AAAA + * PPPP NNNN PPPPNNNNXXXX + * might become might become + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or + * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or + * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8 * * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must @@ -1442,7 +1420,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (offset_in_page(addr)) + if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { @@ -3006,15 +2984,16 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; + unsigned long mapped_addr; /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (offset_in_page(error)) - return error; + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; error = mlock_future_check(mm, mm->def_flags, len); if (error) diff --git a/mm/mprotect.c b/mm/mprotect.c index 7967825f6d33..7a8e84f86831 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -80,6 +80,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa) { struct page *page; + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + continue; + page = vm_normal_page(vma, addr, oldpte); if (!page || PageKsm(page)) continue; @@ -97,10 +101,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (page_is_file_cache(page) && PageDirty(page)) continue; - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) - continue; - /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. diff --git a/mm/mremap.c b/mm/mremap.c index 1fc8a29fbe3f..122938dcec15 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -558,7 +558,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (offset_in_page(ret)) + if (IS_ERR_VALUE(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, @@ -706,7 +706,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (offset_in_page(new_addr)) { + if (IS_ERR_VALUE(new_addr)) { ret = new_addr; goto out; } diff --git a/mm/nommu.c b/mm/nommu.c index 7de592058ab4..bd2b4e5ef144 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -648,7 +648,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (rb_prev) prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - __vma_link_list(mm, vma, prev, parent); + __vma_link_list(mm, vma, prev); } /* @@ -684,13 +684,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ rb_erase(&vma->vm_rb, &mm->mm_rb); - if (vma->vm_prev) - vma->vm_prev->vm_next = vma->vm_next; - else - mm->mmap = vma->vm_next; - - if (vma->vm_next) - vma->vm_next->vm_prev = vma->vm_prev; + __vma_unlink_list(mm, vma); } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f391c0c4ed1d..4785a8a2040e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5354,6 +5354,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " min:%lukB" " low:%lukB" " high:%lukB" + " reserved_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -5375,6 +5376,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), + K(zone->nr_reserved_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), @@ -6711,7 +6713,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); - lruvec_init(node_lruvec(pgdat)); + lruvec_init(&pgdat->__lruvec); } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, @@ -7988,6 +7990,15 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } +static void __zone_pcp_update(struct zone *zone) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); +} + /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each * cpu. It is the fraction of total pages in each zone that a hot per cpu @@ -8019,13 +8030,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) goto out; - for_each_populated_zone(zone) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); - } + for_each_populated_zone(zone) + __zone_pcp_update(zone); out: mutex_unlock(&pcp_batch_high_lock); return ret; @@ -8261,7 +8267,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * The HWPoisoned page may be not in buddy system, and * page_count() is not 0. */ - if ((flags & SKIP_HWPOISON) && PageHWPoison(page)) + if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; if (__PageMovable(page)) @@ -8477,7 +8483,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } /* Make sure the range is really isolated. */ - if (test_pages_isolated(outer_start, end, false)) { + if (test_pages_isolated(outer_start, end, 0)) { pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", __func__, outer_start, end); ret = -EBUSY; @@ -8502,6 +8508,107 @@ done: pfn_max_align_up(end), migratetype); return ret; } + +static int __alloc_contig_pages(unsigned long start_pfn, + unsigned long nr_pages, gfp_t gfp_mask) +{ + unsigned long end_pfn = start_pfn + nr_pages; + + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + gfp_mask); +} + +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long i, end_pfn = start_pfn + nr_pages; + struct page *page; + + for (i = start_pfn; i < end_pfn; i++) { + page = pfn_to_online_page(i); + if (!page) + return false; + + if (page_zone(page) != z) + return false; + + if (PageReserved(page)) + return false; + + if (page_count(page) > 0) + return false; + + if (PageHuge(page)) + return false; + } + return true; +} + +static bool zone_spans_last_pfn(const struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + unsigned long last_pfn = start_pfn + nr_pages - 1; + + return zone_spans_pfn(zone, last_pfn); +} + +/** + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * @nr_pages: Number of contiguous pages to allocate + * @gfp_mask: GFP mask to limit search and used during compaction + * @nid: Target node + * @nodemask: Mask for other possible nodes + * + * This routine is a wrapper around alloc_contig_range(). It scans over zones + * on an applicable zonelist to find a contiguous pfn range which can then be + * tried for allocation with alloc_contig_range(). This routine is intended + * for allocation requests which can not be fulfilled with the buddy allocator. + * + * The allocated memory is always aligned to a page boundary. If nr_pages is a + * power of two then the alignment is guaranteed to be to the given nr_pages + * (e.g. 1GB request would be aligned to 1GB). + * + * Allocated pages can be freed with free_contig_range() or by manually calling + * __free_page() on each allocated page. + * + * Return: pointer to contiguous pages on success, or NULL if not successful. + */ +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + unsigned long ret, pfn, flags; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + + zonelist = node_zonelist(nid, gfp_mask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(gfp_mask), nodemask) { + spin_lock_irqsave(&zone->lock, flags); + + pfn = ALIGN(zone->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { + if (pfn_range_valid_contig(zone, pfn, nr_pages)) { + /* + * We release the zone lock here because + * alloc_contig_range() will also lock the zone + * at some point. If there's an allocation + * spinning on this lock, it may win the race + * and cause alloc_contig_range() to fail... + */ + spin_unlock_irqrestore(&zone->lock, flags); + ret = __alloc_contig_pages(pfn, nr_pages, + gfp_mask); + if (!ret) + return pfn_to_page(pfn); + spin_lock_irqsave(&zone->lock, flags); + } + pfn += nr_pages; + } + spin_unlock_irqrestore(&zone->lock, flags); + } + return NULL; +} #endif /* CONFIG_CONTIG_ALLOC */ void free_contig_range(unsigned long pfn, unsigned int nr_pages) @@ -8523,11 +8630,8 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) */ void __meminit zone_pcp_update(struct zone *zone) { - unsigned cpu; mutex_lock(&pcp_batch_high_lock); - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); + __zone_pcp_update(zone); mutex_unlock(&pcp_batch_high_lock); } @@ -8560,7 +8664,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - unsigned int order, i; + unsigned int order; unsigned long pfn; unsigned long flags; unsigned long offlined_pages = 0; @@ -8588,7 +8692,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) */ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; - SetPageReserved(page); offlined_pages++; continue; } @@ -8602,8 +8705,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) pfn, 1 << order, end_pfn); #endif del_page_from_free_area(page, &zone->free_area[order]); - for (i = 0; i < (1 << order); i++) - SetPageReserved((page+i)); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/page_io.c b/mm/page_io.c index 60a66a58b9bf..3a198deb8bb1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -22,6 +22,7 @@ #include <linux/writeback.h> #include <linux/frontswap.h> #include <linux/blkdev.h> +#include <linux/psi.h> #include <linux/uio.h> #include <linux/sched/task.h> #include <asm/pgtable.h> @@ -354,10 +355,19 @@ int swap_readpage(struct page *page, bool synchronous) struct swap_info_struct *sis = page_swap_info(page); blk_qc_t qc; struct gendisk *disk; + unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); + + /* + * Count submission time as memory stall. When the device is congested, + * or the submitting cgroup IO-throttled, submission can be a + * significant part of overall IO time. + */ + psi_memstall_enter(&pflags); + if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); @@ -371,7 +381,7 @@ int swap_readpage(struct page *page, bool synchronous) ret = mapping->a_ops->readpage(swap_file, page); if (!ret) count_vm_event(PSWPIN); - return ret; + goto out; } ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); @@ -382,7 +392,7 @@ int swap_readpage(struct page *page, bool synchronous) } count_vm_event(PSWPIN); - return 0; + goto out; } ret = 0; @@ -418,6 +428,7 @@ int swap_readpage(struct page *page, bool synchronous) bio_put(bio); out: + psi_memstall_leave(&pflags); return ret; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 89c19c0feadb..04ee1663cdbe 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -168,7 +168,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * @migratetype: Migrate type to set in error recovery. * @flags: The following flags are allowed (they can be combined in * a bit mask) - * SKIP_HWPOISON - ignore hwpoison pages + * MEMORY_OFFLINE - isolate to offline (!allocate) memory + * e.g., skip over PageHWPoison() pages * REPORT_FAILURE - report details about the failure to * isolate the range * @@ -257,7 +258,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, */ static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages) + int flags) { struct page *page; @@ -274,7 +275,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * simple way to verify that as VM_BUG_ON(), though. */ pfn += 1 << page_order(page); - else if (skip_hwpoisoned_pages && PageHWPoison(page)) + else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; else @@ -286,7 +287,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, /* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages) + int isol_flags) { unsigned long pfn, flags; struct page *page; @@ -308,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, - skip_hwpoisoned_pages); + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); spin_unlock_irqrestore(&zone->lock, flags); trace_test_pages_isolated(start_pfn, end_pfn, pfn); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 532c29276fce..3d7c01e76efc 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -24,18 +24,27 @@ void pgd_clear_bad(pgd_t *pgd) pgd_clear(pgd); } +#ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *p4d) { p4d_ERROR(*p4d); p4d_clear(p4d); } +#endif +#ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +#endif +/* + * Note that the pmd variant below can't be stub'ed out just as for p4d/pud + * above. pmd folding is special and typically pmd_* macros refer to upper + * level even when folded + */ void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); diff --git a/mm/rmap.c b/mm/rmap.c index 0c7b2a9400d4..b3e381919835 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -251,18 +251,37 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * If dst->anon_vma is NULL this function tries to find and reuse existing - * anon_vma which has no vmas and only one child anon_vma. This prevents - * degradation of anon_vma hierarchy to endless linear chain in case of - * constantly forking task. On the other hand, an anon_vma with more than one - * child isn't reused even if there was no alive vma, thus rmap walker has a - * good chance of avoiding scanning the whole hierarchy when it searches where - * page is mapped. + * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and + * anon_vma_fork(). The first three want an exact copy of src, while the last + * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent + * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, + * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * + * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find + * and reuse existing anon_vma which has no vmas and only one child anon_vma. + * This prevents degradation of anon_vma hierarchy to endless linear chain in + * case of constantly forking task. On the other hand, an anon_vma with more + * than one child isn't reused even if there was no alive vma, thus rmap + * walker has a good chance of avoiding scanning the whole hierarchy when it + * searches where page is mapped. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; + struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev; + + /* + * If parent share anon_vma with its vm_prev, keep this sharing in in + * child. + * + * 1. Parent has vm_prev, which implies we have vm_prev. + * 2. Parent and its vm_prev have the same anon_vma. + */ + if (!dst->anon_vma && src->anon_vma && + pprev && pprev->anon_vma == src->anon_vma) + dst->anon_vma = prev->anon_vma; + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; @@ -287,8 +306,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) * will always reuse it. Root anon_vma is never reused: * it has self-parent reference and at least one child. */ - if (!dst->anon_vma && anon_vma != src->anon_vma && - anon_vma->degree < 2) + if (!dst->anon_vma && src->anon_vma && + anon_vma != src->anon_vma && anon_vma->degree < 2) dst->anon_vma = anon_vma; } if (dst->anon_vma) @@ -458,9 +477,10 @@ void __init anon_vma_init(void) * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * - * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() - * that the anon_vma pointer from page->mapping is valid if there is a - * mapcount, we can dereference the anon_vma after observing those. + * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from + * page_remove_rmap() that the anon_vma pointer from page->mapping is valid + * if there is a mapcount, we can dereference the anon_vma after observing + * those. */ struct anon_vma *page_get_anon_vma(struct page *page) { @@ -1055,7 +1075,6 @@ static void __page_set_anon_rmap(struct page *page, static void __page_check_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { -#ifdef CONFIG_DEBUG_VM /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1068,9 +1087,9 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); - BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); -#endif + VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), + page); } /** @@ -1273,12 +1292,20 @@ static void page_remove_anon_compound_rmap(struct page *page) if (TestClearPageDoubleMap(page)) { /* * Subpages can be mapped with PTEs too. Check how many of - * themi are still mapped. + * them are still mapped. */ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } + + /* + * Queue the page for deferred split if at least one small + * page of the compound page is unmapped, but at least one + * small page is still mapped. + */ + if (nr && nr < HPAGE_PMD_NR) + deferred_split_huge_page(page); } else { nr = HPAGE_PMD_NR; } @@ -1286,10 +1313,8 @@ static void page_remove_anon_compound_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); - if (nr) { + if (nr) __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); - deferred_split_huge_page(page); - } } /** diff --git a/mm/shmem.c b/mm/shmem.c index 220be9fa2c41..165fa6332993 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1369,7 +1369,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + if (add_to_swap_cache(page, swap, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) { spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; @@ -2022,16 +2023,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { + struct file *fpin; wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; - if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* It's polite to up mmap_sem if we can */ - up_read(&vma->vm_mm->mmap_sem); + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + if (fpin) ret = VM_FAULT_RETRY; - } shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, @@ -2049,6 +2048,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); spin_unlock(&inode->i_lock); + + if (fpin) + fput(fpin); return ret; } spin_unlock(&inode->i_lock); @@ -2213,11 +2215,14 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return -EPERM; /* - * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED - * read-only mapping, take care to not allow mprotect to revert - * protections. + * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. */ - vma->vm_flags &= ~(VM_MAYWRITE); + if (vma->vm_flags & VM_SHARED) + vma->vm_flags &= ~(VM_MAYWRITE); } file_accessed(file); @@ -2742,7 +2747,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, } shmem_falloc.waitq = &shmem_falloc_waitq; - shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; @@ -3928,7 +3933,7 @@ out2: static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int values[] = { + static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, diff --git a/mm/slab.c b/mm/slab.c index 66e5d8032bae..f1e1840af533 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1247,9 +1247,10 @@ void __init kmem_cache_init(void) * structures first. Without this, further allocations will bug. */ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( - kmalloc_info[INDEX_NODE].name, - kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, - 0, kmalloc_size(INDEX_NODE)); + kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL], + kmalloc_info[INDEX_NODE].size, + ARCH_KMALLOC_FLAGS, 0, + kmalloc_info[INDEX_NODE].size); slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); diff --git a/mm/slab.h b/mm/slab.h index b2b01694dc43..7e94700aa78c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -139,7 +139,7 @@ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { - const char *name; + const char *name[NR_KMALLOC_TYPES]; unsigned int size; } kmalloc_info[]; @@ -369,7 +369,7 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) goto out; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); /* transer try_charge() page references to kmem_cache */ @@ -393,7 +393,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, rcu_read_lock(); memcg = READ_ONCE(s->memcg_params.memcg); if (likely(!mem_cgroup_is_root(memcg))) { - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); memcg_kmem_uncharge_memcg(page, order, memcg); } else { diff --git a/mm/slab_common.c b/mm/slab_common.c index f9fb27b4c843..8afa188f6e20 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1139,26 +1139,56 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[kmalloc_type(flags)][index]; } +#ifdef CONFIG_ZONE_DMA +#define INIT_KMALLOC_INFO(__size, __short_size) \ +{ \ + .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ + .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \ + .size = __size, \ +} +#else +#define INIT_KMALLOC_INFO(__size, __short_size) \ +{ \ + .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ + .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + .size = __size, \ +} +#endif + /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is * kmalloc-67108864. */ const struct kmalloc_info_struct kmalloc_info[] __initconst = { - {NULL, 0}, {"kmalloc-96", 96}, - {"kmalloc-192", 192}, {"kmalloc-8", 8}, - {"kmalloc-16", 16}, {"kmalloc-32", 32}, - {"kmalloc-64", 64}, {"kmalloc-128", 128}, - {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, - {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, - {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, - {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, - {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, - {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, - {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, - {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, - {"kmalloc-64M", 67108864} + INIT_KMALLOC_INFO(0, 0), + INIT_KMALLOC_INFO(96, 96), + INIT_KMALLOC_INFO(192, 192), + INIT_KMALLOC_INFO(8, 8), + INIT_KMALLOC_INFO(16, 16), + INIT_KMALLOC_INFO(32, 32), + INIT_KMALLOC_INFO(64, 64), + INIT_KMALLOC_INFO(128, 128), + INIT_KMALLOC_INFO(256, 256), + INIT_KMALLOC_INFO(512, 512), + INIT_KMALLOC_INFO(1024, 1k), + INIT_KMALLOC_INFO(2048, 2k), + INIT_KMALLOC_INFO(4096, 4k), + INIT_KMALLOC_INFO(8192, 8k), + INIT_KMALLOC_INFO(16384, 16k), + INIT_KMALLOC_INFO(32768, 32k), + INIT_KMALLOC_INFO(65536, 64k), + INIT_KMALLOC_INFO(131072, 128k), + INIT_KMALLOC_INFO(262144, 256k), + INIT_KMALLOC_INFO(524288, 512k), + INIT_KMALLOC_INFO(1048576, 1M), + INIT_KMALLOC_INFO(2097152, 2M), + INIT_KMALLOC_INFO(4194304, 4M), + INIT_KMALLOC_INFO(8388608, 8M), + INIT_KMALLOC_INFO(16777216, 16M), + INIT_KMALLOC_INFO(33554432, 32M), + INIT_KMALLOC_INFO(67108864, 64M) }; /* @@ -1208,36 +1238,14 @@ void __init setup_kmalloc_cache_index_table(void) } } -static const char * -kmalloc_cache_name(const char *prefix, unsigned int size) -{ - - static const char units[3] = "\0kM"; - int idx = 0; - - while (size >= 1024 && (size % 1024 == 0)) { - size /= 1024; - idx++; - } - - return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); -} - static void __init -new_kmalloc_cache(int idx, int type, slab_flags_t flags) +new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { - const char *name; - - if (type == KMALLOC_RECLAIM) { + if (type == KMALLOC_RECLAIM) flags |= SLAB_RECLAIM_ACCOUNT; - name = kmalloc_cache_name("kmalloc-rcl", - kmalloc_info[idx].size); - BUG_ON(!name); - } else { - name = kmalloc_info[idx].name; - } - kmalloc_caches[type][idx] = create_kmalloc_cache(name, + kmalloc_caches[type][idx] = create_kmalloc_cache( + kmalloc_info[idx].name[type], kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1249,7 +1257,8 @@ new_kmalloc_cache(int idx, int type, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i, type; + int i; + enum kmalloc_cache_type type; for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { @@ -1278,12 +1287,10 @@ void __init create_kmalloc_caches(slab_flags_t flags) struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { - unsigned int size = kmalloc_size(i); - const char *n = kmalloc_cache_name("dma-kmalloc", size); - - BUG_ON(!n); kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( - n, size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_info[i].name[KMALLOC_DMA], + kmalloc_info[i].size, + SLAB_CACHE_DMA | flags, 0, 0); } } #endif diff --git a/mm/slub.c b/mm/slub.c index e72e802fc569..d11389710b12 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -93,9 +93,7 @@ * minimal so we rely on the page allocators per cpu caches for * fast frees and allocs. * - * Overloading of page flags that are otherwise used for LRU management. - * - * PageActive The slab is frozen and exempt from list processing. + * page->frozen The slab is frozen and exempt from list processing. * This means that the slab is dedicated to a purpose * such as satisfying allocations for a specific * processor. Objects may be freed in the slab while @@ -111,7 +109,7 @@ * free objects in addition to the regular freelist * that requires the slab lock. * - * PageError Slab requires special handling due to debug + * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of * the fast path and disables lockless freelists. */ @@ -736,6 +734,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, { u8 *fault; u8 *end; + u8 *addr = page_address(page); metadata_access_enable(); fault = memchr_inv(start, value, bytes); @@ -748,8 +747,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault[0], value); + pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault - addr, + fault[0], value); print_trailer(s, page, object); restore_bytes(s, what, value, fault, end); @@ -844,7 +844,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); + slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); @@ -4383,31 +4384,26 @@ static int count_total(struct page *page) #endif #ifdef CONFIG_SLUB_DEBUG -static int validate_slab(struct kmem_cache *s, struct page *page, +static void validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { void *p; void *addr = page_address(page); - if (!check_slab(s, page) || - !on_freelist(s, page, NULL)) - return 0; + if (!check_slab(s, page) || !on_freelist(s, page, NULL)) + return; /* Now we know that a valid freelist exists */ bitmap_zero(map, page->objects); get_map(s, page, map); for_each_object(p, s, addr, page->objects) { - if (test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_INACTIVE)) - return 0; - } + u8 val = test_bit(slab_index(p, s, addr), map) ? + SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; - for_each_object(p, s, addr, page->objects) - if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_ACTIVE)) - return 0; - return 1; + if (!check_object(s, page, p, val)) + break; + } } static void validate_slab_slab(struct kmem_cache *s, struct page *page, diff --git a/mm/sparse.c b/mm/sparse.c index f6891c1992b1..b20ab7cdac86 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -458,8 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid(size, - PAGE_SIZE, addr, + map = memblock_alloc_try_nid_raw(size, size, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", @@ -482,10 +481,13 @@ static void __init sparse_buffer_init(unsigned long size, int nid) { phys_addr_t addr = __pa(MAX_DMA_ADDRESS); WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ - sparsemap_buf = - memblock_alloc_try_nid_raw(size, PAGE_SIZE, - addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + /* + * Pre-allocated buffer is mainly used by __populate_section_memmap + * and we want it to be properly aligned to the section size - this is + * especially the case for VMEMMAP which maps memmap to PMDs + */ + sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), + addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } @@ -647,7 +649,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static struct page *populate_section_memmap(unsigned long pfn, +static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { return __populate_section_memmap(pfn, nr_pages, nid, altmap); @@ -669,7 +671,7 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } #else -struct page *populate_section_memmap(unsigned long pfn, +struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { struct page *page, *ret; diff --git a/mm/swap.c b/mm/swap.c index 38c3fa4308e2..5341ae93861f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -373,9 +373,16 @@ static void __lru_cache_activate_page(struct page *page) void mark_page_accessed(struct page *page) { page = compound_head(page); - if (!PageActive(page) && !PageUnevictable(page) && - PageReferenced(page)) { + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { + /* + * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, + * this list is never rotated or maintained, so marking an + * evictable page accessed has no effect. + */ + } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via * activate_page_pvecs. Otherwise, assume the page is on a @@ -389,8 +396,6 @@ void mark_page_accessed(struct page *page) ClearPageReferenced(page); if (page_is_file_cache(page)) workingset_activation(page); - } else if (!PageReferenced(page)) { - SetPageReferenced(page); } if (page_is_idle(page)) clear_page_idle(page); @@ -708,9 +713,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ void lru_add_drain_all(void) { + static seqcount_t seqcount = SEQCNT_ZERO(seqcount); static DEFINE_MUTEX(lock); static struct cpumask has_work; - int cpu; + int cpu, seq; /* * Make sure nobody triggers this path before mm_percpu_wq is fully @@ -719,7 +725,19 @@ void lru_add_drain_all(void) if (WARN_ON(!mm_percpu_wq)) return; + seq = raw_read_seqcount_latch(&seqcount); + mutex_lock(&lock); + + /* + * Piggyback on drain started and finished while we waited for lock: + * all pages pended at the time of our enter were drained from vectors. + */ + if (__read_seqcount_retry(&seqcount, seq)) + goto done; + + raw_write_seqcount_latch(&seqcount); + cpumask_clear(&has_work); for_each_online_cpu(cpu) { @@ -740,6 +758,7 @@ void lru_add_drain_all(void) for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); +done: mutex_unlock(&lock); } #else diff --git a/mm/swapfile.c b/mm/swapfile.c index dab43523afdd..bb3261d45b6a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2887,6 +2887,13 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) return error; + /* + * Zoned block devices contain zones that have a sequential + * write only restriction. Hence zoned block devices are not + * suitable for swapping. Disallow them here. + */ + if (blk_queue_is_zoned(p->bdev->bd_queue)) + return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index c7ae74ce5ff3..1b0d7abad1d4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,36 @@ #include <asm/tlbflush.h> #include "internal.h" +static __always_inline +struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + /* + * Make sure that the dst range is both valid and fully within a + * single existing vma. + */ + struct vm_area_struct *dst_vma; + + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma) + return NULL; + + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + return NULL; + + /* + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + return NULL; + + return dst_vma; +} + static int mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -60,7 +90,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* * The memory barrier inside __SetPageUptodate makes sure that - * preceeding stores to the page contents become visible before + * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); @@ -184,7 +214,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; - struct hstate *h; unsigned long vma_hpagesize; pgoff_t idx; u32 hash; @@ -221,20 +250,9 @@ retry: */ if (!dst_vma) { err = -ENOENT; - dst_vma = find_vma(dst_mm, dst_start); + dst_vma = find_dst_vma(dst_mm, dst_start, len); if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; - /* - * Check the vma is registered in uffd, this is - * required to enforce the VM_MAYWRITE check done at - * uffd registration time. - */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out_unlock; - - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out_unlock; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) @@ -243,10 +261,6 @@ retry: vm_shared = dst_vma->vm_flags & VM_SHARED; } - if (WARN_ON(dst_addr & (vma_hpagesize - 1) || - (len - copied) & (vma_hpagesize - 1))) - goto out_unlock; - /* * If not shared, ensure the dst_vma has a anon_vma. */ @@ -256,24 +270,21 @@ retry: goto out_unlock; } - h = hstate_vma(dst_vma); - while (src_addr < src_start + len) { pte_t dst_pteval; BUG_ON(dst_addr >= dst_start + len); - VM_BUG_ON(dst_addr & ~huge_page_mask(h)); /* * Serialize via hugetlb_fault_mutex */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); + dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; @@ -300,7 +311,8 @@ retry: err = copy_huge_page_from_user(page, (const void __user *)src_addr, - pages_per_huge_page(h), true); + vma_hpagesize / PAGE_SIZE, + true); if (unlikely(err)) { err = -EFAULT; goto out; @@ -475,20 +487,9 @@ retry: * both valid and fully within a single existing vma. */ err = -ENOENT; - dst_vma = find_vma(dst_mm, dst_start); + dst_vma = find_dst_vma(dst_mm, dst_start, len); if (!dst_vma) goto out_unlock; - /* - * Check the vma is registered in uffd, this is required to - * enforce the VM_MAYWRITE check done at uffd registration - * time. - */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out_unlock; - - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out_unlock; err = -EINVAL; /* diff --git a/mm/util.c b/mm/util.c index 3ad6db9a722e..988d11e6c17c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -271,7 +271,7 @@ void *memdup_user_nul(const void __user *src, size_t len) EXPORT_SYMBOL(memdup_user_nul); void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) + struct vm_area_struct *prev) { struct vm_area_struct *next; @@ -280,18 +280,28 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, next = prev->vm_next; prev->vm_next = vma; } else { + next = mm->mmap; mm->mmap = vma; - if (rb_parent) - next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - next = NULL; } vma->vm_next = next; if (next) next->vm_prev = vma; } +void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) +{ + struct vm_area_struct *prev, *next; + + next = vma->vm_next; + prev = vma->vm_prev; + if (prev) + prev->vm_next = next; + else + mm->mmap = next; + if (next) + next->vm_prev = prev; +} + /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4a7d7459c4f9..4d3b3d60d893 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(vmap_area_lock); +static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); @@ -682,7 +683,7 @@ insert_vmap_area_augment(struct vmap_area *va, * free area is inserted. If VA has been merged, it is * freed. */ -static __always_inline void +static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { @@ -749,7 +750,10 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); - return; + + /* Point to the new merged area. */ + va = sibling; + merged = true; } } @@ -758,6 +762,8 @@ insert: link_va(va, root, parent, link, head); augment_tree_propagate_from(va); } + + return va; } static __always_inline bool @@ -968,6 +974,19 @@ adjust_va_to_fit_type(struct vmap_area *va, * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. + * + * Also we can hit this path in case of regular "vmap" + * allocations, if "this" current CPU was not preloaded. + * See the comment in alloc_vmap_area() why. If so, then + * GFP_NOWAIT is used instead to get an extra object for + * split purpose. That is rare and most time does not + * occur. + * + * What happens if an allocation gets failed. Basically, + * an "overflow" path is triggered to purge lazily freed + * areas to free some memory, then, the "retry" path is + * triggered to repeat one more time. See more details + * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) @@ -1063,9 +1082,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, - gfp_mask & GFP_RECLAIM_MASK, node); + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -1073,49 +1092,55 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: /* - * Preload this CPU with one extra vmap_area object to ensure - * that we have it available when fit type of free area is - * NE_FIT_TYPE. + * Preload this CPU with one extra vmap_area object. It is used + * when fit type of free area is NE_FIT_TYPE. Please note, it + * does not guarantee that an allocation occurs on a CPU that + * is preloaded, instead we minimize the case when it is not. + * It can happen because of cpu migration, because there is a + * race until the below spinlock is taken. * * The preload is done in non-atomic context, thus it allows us * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. + * low memory condition and high memory pressure. In rare case, + * if not preloaded, GFP_NOWAIT is used. * - * Even if it fails we do not really care about that. Just proceed - * as it is. "overflow" path will refill the cache we allocate from. + * Set "pva" to NULL here, because of "retry" path. */ - preempt_disable(); - if (!__this_cpu_read(ne_fit_preload_node)) { - preempt_enable(); - pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); - preempt_disable(); - - if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) { - if (pva) - kmem_cache_free(vmap_area_cachep, pva); - } - } + pva = NULL; - spin_lock(&vmap_area_lock); - preempt_enable(); + if (!this_cpu_read(ne_fit_preload_node)) + /* + * Even if it fails we do not really care about that. + * Just proceed as it is. If needed "overflow" path + * will refill the cache we allocate from. + */ + pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(&free_vmap_area_lock); + + if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) + kmem_cache_free(vmap_area_cachep, pva); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_lock(&vmap_area_lock); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -1125,7 +1150,6 @@ retry: return va; overflow: - spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = 1; @@ -1161,28 +1185,24 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); -static void __free_vmap_area(struct vmap_area *va) +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) { /* * Remove from the busy tree/list. */ + spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); /* - * Merge VA with its neighbors, otherwise just add it. + * Insert/Merge it back to the free tree/list. */ - merge_or_add_vmap_area(va, - &free_vmap_area_root, &free_vmap_area_list); -} - -/* - * Free a region of KVA allocated by alloc_vmap_area - */ -static void free_vmap_area(struct vmap_area *va) -{ - spin_lock(&vmap_area_lock); - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); + merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); } /* @@ -1275,24 +1295,30 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; - spin_lock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long orig_start = va->va_start; + unsigned long orig_end = va->va_end; /* * Finally insert or merge lazily-freed area. It is * detached and there is no need to "unlink" it from * anything. */ - merge_or_add_vmap_area(va, - &free_vmap_area_root, &free_vmap_area_list); + va = merge_or_add_vmap_area(va, &free_vmap_area_root, + &free_vmap_area_list); + + if (is_vmalloc_or_module_addr((void *)orig_start)) + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&vmap_area_lock); + cond_resched_lock(&free_vmap_area_lock); } - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); return true; } @@ -2014,15 +2040,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) } EXPORT_SYMBOL_GPL(map_vm_area); -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) +static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, + struct vmap_area *va, unsigned long flags, const void *caller) { - spin_lock(&vmap_area_lock); vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; +} + +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, const void *caller) +{ + spin_lock(&vmap_area_lock); + setup_vmalloc_vm_locked(vm, va, flags, caller); spin_unlock(&vmap_area_lock); } @@ -2068,6 +2100,22 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, setup_vmalloc_vm(area, va, flags, caller); + /* + * For KASAN, if we are in vmalloc space, we need to cover the shadow + * area with real memory. If we come here through VM_ALLOC, this is + * done by a higher level function that has access to the true size, + * which might not be a full page. + * + * We assume module space comes via VM_ALLOC path. + */ + if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) { + if (kasan_populate_vmalloc(area->size, area)) { + unmap_vmap_area(va); + kfree(area); + return NULL; + } + } + return area; } @@ -2245,6 +2293,9 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + if (area->flags & VM_KASAN) + kasan_poison_vmalloc(area->addr, area->size); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { @@ -2440,7 +2491,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -2497,6 +2548,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!addr) return NULL; + if (is_vmalloc_or_module_addr(area->addr)) { + if (kasan_populate_vmalloc(real_size, area)) + return NULL; + } + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. @@ -3282,7 +3338,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free; } retry: - spin_lock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; @@ -3364,29 +3420,44 @@ retry: va = vas[area]; va->va_start = start; va->va_end = start + size; - - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); /* insert all vm's */ - for (area = 0; area < nr_vms; area++) - setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + spin_lock(&vmap_area_lock); + for (area = 0; area < nr_vms; area++) { + insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + + setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + } + spin_unlock(&vmap_area_lock); + + /* populate the shadow space outside of the lock */ + for (area = 0; area < nr_vms; area++) { + /* assume success here */ + kasan_populate_vmalloc(sizes[area], vms[area]); + } kfree(vas); return vms; recovery: - /* Remove previously inserted areas. */ + /* + * Remove previously allocated areas. There is no + * need in removing these areas from the busy tree, + * because they are inserted only on the final step + * and when pcpu_get_vm_areas() is success. + */ while (area--) { - __free_vmap_area(vas[area]); + merge_or_add_vmap_area(vas[area], &free_vmap_area_root, + &free_vmap_area_list); vas[area] = NULL; } overflow: - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = true; @@ -3437,9 +3508,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmap_purge_lock) __acquires(&vmap_area_lock) { + mutex_lock(&vmap_purge_lock); spin_lock(&vmap_area_lock); + return seq_list_start(&vmap_area_list, *pos); } @@ -3449,8 +3523,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmap_purge_lock) __releases(&vmap_area_lock) { + mutex_unlock(&vmap_purge_lock); spin_unlock(&vmap_area_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index ee4eecc7e1c2..74e8edce83ca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,13 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* Can active pages be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; @@ -101,6 +108,12 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file pages on the current node are dangerously low */ + unsigned int file_is_tiny:1; + /* Allocation order */ s8 order; @@ -239,13 +252,13 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) up_write(&shrinker_rwsem); } -static bool global_reclaim(struct scan_control *sc) +static bool cgroup_reclaim(struct scan_control *sc) { - return !sc->target_mem_cgroup; + return sc->target_mem_cgroup; } /** - * sane_reclaim - is the usual dirty throttling mechanism operational? + * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is @@ -257,11 +270,9 @@ static bool global_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ -static bool sane_reclaim(struct scan_control *sc) +static bool writeback_throttling_sane(struct scan_control *sc) { - struct mem_cgroup *memcg = sc->target_mem_cgroup; - - if (!memcg) + if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) @@ -269,29 +280,6 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } - -static void set_memcg_congestion(pg_data_t *pgdat, - struct mem_cgroup *memcg, - bool congested) -{ - struct mem_cgroup_per_node *mn; - - if (!memcg) - return; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - WRITE_ONCE(mn->congested, congested); -} - -static bool memcg_congested(pg_data_t *pgdat, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *mn; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - return READ_ONCE(mn->congested); - -} #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -302,27 +290,15 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } -static bool global_reclaim(struct scan_control *sc) +static bool cgroup_reclaim(struct scan_control *sc) { - return true; + return false; } -static bool sane_reclaim(struct scan_control *sc) +static bool writeback_throttling_sane(struct scan_control *sc) { return true; } - -static inline void set_memcg_congestion(struct pglist_data *pgdat, - struct mem_cgroup *memcg, bool congested) -{ -} - -static inline bool memcg_congested(struct pglist_data *pgdat, - struct mem_cgroup *memcg) -{ - return false; - -} #endif /* @@ -351,32 +327,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone) */ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { - unsigned long lru_size = 0; + unsigned long size = 0; int zid; - if (!mem_cgroup_disabled()) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) - lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); - } else - lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); - - for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; - unsigned long size; if (!managed_zone(zone)) continue; if (!mem_cgroup_disabled()) - size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else - size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], - NR_ZONE_LRU_BASE + lru); - lru_size -= min(size, lru_size); + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } - - return lru_size; - + return size; } /* @@ -775,7 +740,7 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } -static int may_write_to_inode(struct inode *inode, struct scan_control *sc) +static int may_write_to_inode(struct inode *inode) { if (current->flags & PF_SWAPWRITE) return 1; @@ -823,8 +788,7 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping, - struct scan_control *sc) +static pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -860,7 +824,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host, sc)) + if (!may_write_to_inode(mapping->host)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -899,7 +863,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct page *page, - bool reclaimed) + bool reclaimed, struct mem_cgroup *target_memcg) { unsigned long flags; int refcount; @@ -971,7 +935,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(page); + shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); @@ -994,7 +958,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page, false)) { + if (__remove_mapping(mapping, page, false, NULL)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1239,7 +1203,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; /* Case 2 above */ - } else if (sane_reclaim(sc) || + } else if (writeback_throttling_sane(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() @@ -1394,7 +1358,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(page, mapping, sc)) { + switch (pageout(page, mapping)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -1472,7 +1436,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); - } else if (!mapping || !__remove_mapping(mapping, page, true)) + } else if (!mapping || !__remove_mapping(mapping, page, true, + sc->target_mem_cgroup)) goto keep_locked; unlock_page(page); @@ -1820,7 +1785,7 @@ int isolate_lru_page(struct page *page) /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and - * then get resheduled. When there are massive number of tasks doing page + * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. @@ -1833,7 +1798,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, if (current_is_kswapd()) return 0; - if (!sane_reclaim(sc)) + if (!writeback_throttling_sane(sc)) return 0; if (file) { @@ -1983,7 +1948,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -1997,7 +1962,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; @@ -2199,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list) return nr_reclaimed; } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. @@ -2227,64 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list) * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { - enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; - unsigned long refaults; unsigned long gb; - /* - * If we don't have swap space, anonymous page deactivation - * is pointless. - */ - if (!file && !total_swap_pages) - return false; - - inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); - active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - - /* - * When refaults are being observed, it means a new workingset - * is being established. Disable active list protection to get - * rid of the stale workingset quickly. - */ - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - if (file && lruvec->refaults != refaults) { - inactive_ratio = 0; - } else { - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - } + inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); + active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); - if (trace) - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; return inactive * inactive_ratio < active; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) -{ - if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) - shrink_active_list(nr_to_scan, lruvec, sc, lru); - return 0; - } - - return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -2301,10 +2241,10 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *nr, - unsigned long *lru_pages) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) { + struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -2329,7 +2269,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !swappiness) { + if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -2345,58 +2285,18 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, } /* - * Prevent the reclaimer from falling into the cache trap: as - * cache pages start out inactive, every cache fault will tip - * the scan balance towards the file LRU. And as the file LRU - * shrinks, so does the window for rotation from references. - * This means we have a runaway feedback loop where a tiny - * thrashing file LRU becomes infinitely more attractive than - * anon pages. Try to detect this based on file LRU size. + * If the system is almost out of file pages, force-scan anon. */ - if (global_reclaim(sc)) { - unsigned long pgdatfile; - unsigned long pgdatfree; - int z; - unsigned long total_high_wmark = 0; - - pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); - pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE); - - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - - total_high_wmark += high_wmark_pages(zone); - } - - if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { - /* - * Force SCAN_ANON if there are enough inactive - * anonymous pages on the LRU in eligible zones. - * Otherwise, the small LRU gets thrashed. - */ - if (!inactive_list_is_low(lruvec, false, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) - >> sc->priority) { - scan_balance = SCAN_ANON; - goto out; - } - } + if (sc->file_is_tiny) { + scan_balance = SCAN_ANON; + goto out; } /* - * If there is enough inactive page cache, i.e. if the size of the - * inactive list is greater than that of the active list *and* the - * inactive list actually has some pages to scan on this priority, we - * do not reclaim anything from the anonymous working set right now. - * Without the second condition we could end up never scanning an - * lruvec even if it has plenty of old anonymous pages unless the - * system is under heavy pressure. + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } @@ -2454,7 +2354,6 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long lruvec_size; @@ -2549,18 +2448,12 @@ out: BUG(); } - *lru_pages += lruvec_size; nr[lru] = scan; } } -/* - * This is a basic per-node page freer. Used by both kswapd and direct reclaim. - */ -static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *lru_pages) +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2570,7 +2463,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, memcg, sc, nr, lru_pages); + get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2586,7 +2479,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ - scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); blk_start_plug(&plug); @@ -2668,7 +2561,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2744,156 +2637,234 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } -static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { - return test_bit(PGDAT_CONGESTED, &pgdat->flags) || - (memcg && memcg_congested(pgdat, memcg)); + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long reclaimed; + unsigned long scanned; + + switch (mem_cgroup_protected(target_memcg, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ + break; + } + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; bool reclaimable = false; + unsigned long file; - do { - struct mem_cgroup *root = sc->target_mem_cgroup; - unsigned long node_lru_pages = 0; - struct mem_cgroup *memcg; + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - memset(&sc->nr, 0, sizeof(sc->nr)); +again: + memset(&sc->nr, 0, sizeof(sc->nr)); - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, NULL); - do { - unsigned long lru_pages; - unsigned long reclaimed; - unsigned long scanned; + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; - switch (mem_cgroup_protected(root, memcg)) { - case MEMCG_PROT_MIN: - /* - * Hard protection. - * If there is no reclaimable memory, OOM. - */ - continue; - case MEMCG_PROT_LOW: - /* - * Soft protection. - * Respect the protection only as long as - * there is an unprotected supply - * of reclaimable memory from other cgroups. - */ - if (!sc->memcg_low_reclaim) { - sc->memcg_low_skipped = 1; - continue; - } - memcg_memory_event(memcg, MEMCG_LOW); - break; - case MEMCG_PROT_NONE: - /* - * All protection thresholds breached. We may - * still choose to vary the scan pressure - * applied based on by how much the cgroup in - * question has exceeded its protection - * thresholds (see get_scan_count). - */ - break; - } + if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; - reclaimed = sc->nr_reclaimed; - scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); - node_lru_pages += lru_pages; + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE); + if (refaults != target_lruvec->refaults || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; - /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); - } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!managed_zone(zone)) + continue; - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; + total_high_wmark += high_wmark_pages(zone); } - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); - if (sc->nr_reclaimed - nr_reclaimed) - reclaimable = true; + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } - if (current_is_kswapd()) { - /* - * If reclaim is isolating dirty pages under writeback, - * it implies that the long-lived page allocation rate - * is exceeding the page laundering rate. Either the - * global limits are not being effective at throttling - * processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing - * device. The only option is to throttle from reclaim - * context which is not ideal as there is no guarantee - * the dirtying process is throttled in the same way - * balance_dirty_pages() manages. - * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will - * count the number of pages under pages flagged for - * immediate reclaim and stall if any are encountered - * in the nr_immediate check below. - */ - if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); + shrink_node_memcgs(pgdat, sc); - /* - * Tag a node as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. - */ - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU - * faster than they are written so also forcibly stall. - */ - if (sc->nr.immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + if (current_is_kswapd()) { /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling in wait_iff_congested(). + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. */ - if (!global_reclaim(sc) && sane_reclaim(sc) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, root, true); + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); /* - * Stall direct reclaim for IO completions if underlying BDIs - * and node is congested. Allow kswapd to continue until it - * starts encountering unqueued dirty pages or cycling through - * the LRU too quickly. + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && pgdat_memcg_congested(pgdat, root)) - wait_iff_congested(BLK_RW_ASYNC, HZ/10); + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } - } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc)); + /* + * Tag a node/memcg as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. + * + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + + if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)) + goto again; /* * Kswapd gives up on balancing particular nodes after too @@ -2973,7 +2944,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * Take care memory controller reclaiming has small influence * to global LRU. */ - if (global_reclaim(sc)) { + if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; @@ -3032,19 +3003,14 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } -static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_iter(root_memcg, NULL, NULL); - do { - unsigned long refaults; - struct lruvec *lruvec; + struct lruvec *target_lruvec; + unsigned long refaults; - lruvec = mem_cgroup_lruvec(pgdat, memcg); - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - lruvec->refaults = refaults; - } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); + target_lruvec->refaults = refaults; } /* @@ -3073,7 +3039,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, retry: delayacct_freepages_start(); - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { @@ -3102,8 +3068,16 @@ retry: if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); - set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); + + if (cgroup_reclaim(sc)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, + zone->zone_pgdat); + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + } } delayacct_freepages_end(); @@ -3115,9 +3089,27 @@ retry: if (sc->compaction_ready) return 1; + /* + * We make inactive:active ratio decisions based on the node's + * composition of memory, but a restrictive reclaim_idx or a + * memory.low cgroup setting can exempt large amounts of + * memory from reclaim. Neither of which are very common, so + * instead of doing costly eligibility calculations of the + * entire cgroup subtree up front, we assume the estimates are + * good, and retry with forcible deactivation if that fails. + */ + if (sc->skipped_deactivate) { + sc->priority = initial_priority; + sc->force_deactivate = 1; + sc->skipped_deactivate = 0; + goto retry; + } + /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; + sc->force_deactivate = 0; + sc->skipped_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; @@ -3309,6 +3301,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, pg_data_t *pgdat, unsigned long *nr_scanned) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, @@ -3317,7 +3310,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; - unsigned long lru_pages; WARN_ON_ONCE(!current->reclaim_state); @@ -3334,7 +3326,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); + shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -3348,10 +3340,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool may_swap) { - struct zonelist *zonelist; unsigned long nr_reclaimed; unsigned long pflags; - int nid; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), @@ -3364,16 +3354,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = may_swap, }; - - set_task_reclaim_state(current, &sc.reclaim_state); /* - * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't - * take care of from where we get pages. So the node where we start the - * scan does not need to be the current node. + * Traverse the ZONELIST_FALLBACK zonelist of the current node to put + * equal pressure on all the nodes. This is based on the assumption that + * the reclaim does not bail out early. */ - nid = mem_cgroup_select_victim_node(memcg); + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; + set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); @@ -3396,18 +3384,20 @@ static void age_active_anon(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; + struct lruvec *lruvec; if (!total_swap_pages) return; + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + return; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - - if (inactive_list_is_low(lruvec, false, sc, true)) - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, - sc, LRU_ACTIVE_ANON); - + lruvec = mem_cgroup_lruvec(memcg, pgdat); + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } @@ -3475,7 +3465,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { - clear_bit(PGDAT_CONGESTED, &pgdat->flags); + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } diff --git a/mm/workingset.c b/mm/workingset.c index c963831d354f..474186b76ced 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -213,28 +213,53 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } +static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + /* + * Reclaiming a cgroup means reclaiming all its children in a + * round-robin fashion. That means that each cgroup has an LRU + * order that is composed of the LRU orders of its child + * cgroups; and every page has an LRU position not just in the + * cgroup that owns it, but in all of that group's ancestors. + * + * So when the physical inactive list of a leaf cgroup ages, + * the virtual inactive lists of all its parents, including + * the root cgroup's, age as well. + */ + do { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + atomic_long_inc(&lruvec->inactive_age); + } while (memcg && (memcg = parent_mem_cgroup(memcg))); +} + /** * workingset_eviction - note the eviction of a page from memory + * @target_memcg: the cgroup that is causing the reclaim * @page: the page being evicted * * Returns a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ -void *workingset_eviction(struct page *page) +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = page_pgdat(page); - struct mem_cgroup *memcg = page_memcg(page); - int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; + int memcgid; /* Page is fully exclusive and pins page->mem_cgroup */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_lruvec(pgdat, memcg); - eviction = atomic_long_inc_return(&lruvec->inactive_age); + advance_inactive_age(page_memcg(page), pgdat); + + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->inactive_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -244,10 +269,13 @@ void *workingset_eviction(struct page *page) * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the node it was allocated in. + * evicted page in the context of the node and the memcg whose memory + * pressure caused the eviction. */ void workingset_refault(struct page *page, void *shadow) { + struct mem_cgroup *eviction_memcg; + struct lruvec *eviction_lruvec; unsigned long refault_distance; struct pglist_data *pgdat; unsigned long active_file; @@ -277,12 +305,12 @@ void workingset_refault(struct page *page, void *shadow) * would be better if the root_mem_cgroup existed in all * configurations instead. */ - memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) goto out; - lruvec = mem_cgroup_lruvec(pgdat, memcg); - refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->inactive_age); + active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); /* * Calculate the refault distance @@ -302,6 +330,17 @@ void workingset_refault(struct page *page, void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; + /* + * The activation decision for this page is made at the level + * where the eviction occurred, as that is where the LRU order + * during page reclaim is being determined. + * + * However, the cgroup that will own the page is the one that + * is actually experiencing the refault event. + */ + memcg = page_memcg(page); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); /* @@ -313,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - atomic_long_inc(&lruvec->inactive_age); + advance_inactive_age(memcg, pgdat); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -332,7 +371,6 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); /* @@ -345,8 +383,7 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); - atomic_long_inc(&lruvec->inactive_age); + advance_inactive_age(memcg, page_pgdat(page)); out: rcu_read_unlock(); } @@ -426,7 +463,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); diff --git a/mm/z3fold.c b/mm/z3fold.c index 6d3d3f698ebb..43754d8ebce8 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,6 +41,7 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/rwlock.h> #include <linux/zpool.h> #include <linux/magic.h> @@ -90,6 +91,7 @@ struct z3fold_buddy_slots { */ unsigned long slot[BUDDY_MASK + 1]; unsigned long pool; /* back link + flags */ + rwlock_t lock; }; #define HANDLE_FLAG_MASK (0x03) @@ -124,6 +126,7 @@ struct z3fold_header { unsigned short start_middle; unsigned short first_num:2; unsigned short mapped_count:2; + unsigned short foreign_handles:2; }; /** @@ -178,6 +181,19 @@ enum z3fold_page_flags { PAGE_CLAIMED, /* by either reclaim or free */ }; +/* + * handle flags, go under HANDLE_FLAG_MASK + */ +enum z3fold_handle_flags { + HANDLES_ORPHANED = 0, +}; + +/* + * Forward declarations + */ +static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool); +static void compact_page_work(struct work_struct *w); + /***************** * Helpers *****************/ @@ -191,8 +207,6 @@ static int size_to_chunks(size_t size) #define for_each_unbuddied_list(_iter, _begin) \ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) -static void compact_page_work(struct work_struct *w); - static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { @@ -204,6 +218,7 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, if (slots) { memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; + rwlock_init(&slots->lock); } return slots; @@ -219,25 +234,110 @@ static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); } +/* Lock a z3fold page */ +static inline void z3fold_page_lock(struct z3fold_header *zhdr) +{ + spin_lock(&zhdr->page_lock); +} + +/* Try to lock a z3fold page */ +static inline int z3fold_page_trylock(struct z3fold_header *zhdr) +{ + return spin_trylock(&zhdr->page_lock); +} + +/* Unlock a z3fold page */ +static inline void z3fold_page_unlock(struct z3fold_header *zhdr) +{ + spin_unlock(&zhdr->page_lock); +} + + +static inline struct z3fold_header *__get_z3fold_header(unsigned long handle, + bool lock) +{ + struct z3fold_buddy_slots *slots; + struct z3fold_header *zhdr; + int locked = 0; + + if (!(handle & (1 << PAGE_HEADLESS))) { + slots = handle_to_slots(handle); + do { + unsigned long addr; + + read_lock(&slots->lock); + addr = *(unsigned long *)handle; + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + if (lock) + locked = z3fold_page_trylock(zhdr); + read_unlock(&slots->lock); + if (locked) + break; + cpu_relax(); + } while (lock); + } else { + zhdr = (struct z3fold_header *)(handle & PAGE_MASK); + } + + return zhdr; +} + +/* Returns the z3fold page where a given handle is stored */ +static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) +{ + return __get_z3fold_header(h, false); +} + +/* return locked z3fold page if it's not headless */ +static inline struct z3fold_header *get_z3fold_header(unsigned long h) +{ + return __get_z3fold_header(h, true); +} + +static inline void put_z3fold_header(struct z3fold_header *zhdr) +{ + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_HEADLESS, &page->private)) + z3fold_page_unlock(zhdr); +} + static inline void free_handle(unsigned long handle) { struct z3fold_buddy_slots *slots; + struct z3fold_header *zhdr; int i; bool is_free; if (handle & (1 << PAGE_HEADLESS)) return; - WARN_ON(*(unsigned long *)handle == 0); - *(unsigned long *)handle = 0; + if (WARN_ON(*(unsigned long *)handle == 0)) + return; + + zhdr = handle_to_z3fold_header(handle); slots = handle_to_slots(handle); + write_lock(&slots->lock); + *(unsigned long *)handle = 0; + write_unlock(&slots->lock); + if (zhdr->slots == slots) + return; /* simple case, nothing else to do */ + + /* we are freeing a foreign handle if we are here */ + zhdr->foreign_handles--; is_free = true; + read_lock(&slots->lock); + if (!test_bit(HANDLES_ORPHANED, &slots->pool)) { + read_unlock(&slots->lock); + return; + } for (i = 0; i <= BUDDY_MASK; i++) { if (slots->slot[i]) { is_free = false; break; } } + read_unlock(&slots->lock); if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); @@ -322,6 +422,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, zhdr->first_num = 0; zhdr->start_middle = 0; zhdr->cpu = -1; + zhdr->foreign_handles = 0; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); @@ -341,24 +442,6 @@ static void free_z3fold_page(struct page *page, bool headless) __free_page(page); } -/* Lock a z3fold page */ -static inline void z3fold_page_lock(struct z3fold_header *zhdr) -{ - spin_lock(&zhdr->page_lock); -} - -/* Try to lock a z3fold page */ -static inline int z3fold_page_trylock(struct z3fold_header *zhdr) -{ - return spin_trylock(&zhdr->page_lock); -} - -/* Unlock a z3fold page */ -static inline void z3fold_page_unlock(struct z3fold_header *zhdr) -{ - spin_unlock(&zhdr->page_lock); -} - /* Helper function to build the index */ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) { @@ -389,7 +472,9 @@ static unsigned long __encode_handle(struct z3fold_header *zhdr, if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); + write_lock(&slots->lock); slots->slot[idx] = h; + write_unlock(&slots->lock); return (unsigned long)&slots->slot[idx]; } @@ -398,22 +483,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) return __encode_handle(zhdr, zhdr->slots, bud); } -/* Returns the z3fold page where a given handle is stored */ -static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) -{ - unsigned long addr = h; - - if (!(addr & (1 << PAGE_HEADLESS))) - addr = *(unsigned long *)h; - - return (struct z3fold_header *)(addr & PAGE_MASK); -} - /* only for LAST bud, returns zero otherwise */ static unsigned short handle_to_chunks(unsigned long handle) { - unsigned long addr = *(unsigned long *)handle; + struct z3fold_buddy_slots *slots = handle_to_slots(handle); + unsigned long addr; + read_lock(&slots->lock); + addr = *(unsigned long *)handle; + read_unlock(&slots->lock); return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; } @@ -425,10 +503,13 @@ static unsigned short handle_to_chunks(unsigned long handle) static enum buddy handle_to_buddy(unsigned long handle) { struct z3fold_header *zhdr; + struct z3fold_buddy_slots *slots = handle_to_slots(handle); unsigned long addr; + read_lock(&slots->lock); WARN_ON(handle & (1 << PAGE_HEADLESS)); addr = *(unsigned long *)handle; + read_unlock(&slots->lock); zhdr = (struct z3fold_header *)(addr & PAGE_MASK); return (addr - zhdr->first_num) & BUDDY_MASK; } @@ -442,6 +523,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); struct z3fold_pool *pool = zhdr_to_pool(zhdr); + bool is_free = true; + int i; WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); @@ -450,8 +533,25 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) if (!list_empty(&page->lru)) list_del_init(&page->lru); spin_unlock(&pool->lock); + + /* If there are no foreign handles, free the handles array */ + read_lock(&zhdr->slots->lock); + for (i = 0; i <= BUDDY_MASK; i++) { + if (zhdr->slots->slot[i]) { + is_free = false; + break; + } + } + if (!is_free) + set_bit(HANDLES_ORPHANED, &zhdr->slots->pool); + read_unlock(&zhdr->slots->lock); + + if (is_free) + kmem_cache_free(pool->c_handle, zhdr->slots); + if (locked) z3fold_page_unlock(zhdr); + spin_lock(&pool->stale_lock); list_add(&zhdr->buddy, &pool->stale); queue_work(pool->release_wq, &pool->work); @@ -479,6 +579,7 @@ static void release_z3fold_page_locked_list(struct kref *ref) struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); struct z3fold_pool *pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); @@ -559,6 +660,119 @@ static inline void *mchunk_memmove(struct z3fold_header *zhdr, zhdr->middle_chunks << CHUNK_SHIFT); } +static inline bool buddy_single(struct z3fold_header *zhdr) +{ + return !((zhdr->first_chunks && zhdr->middle_chunks) || + (zhdr->first_chunks && zhdr->last_chunks) || + (zhdr->middle_chunks && zhdr->last_chunks)); +} + +static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) +{ + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + void *p = zhdr; + unsigned long old_handle = 0; + size_t sz = 0; + struct z3fold_header *new_zhdr = NULL; + int first_idx = __idx(zhdr, FIRST); + int middle_idx = __idx(zhdr, MIDDLE); + int last_idx = __idx(zhdr, LAST); + unsigned short *moved_chunks = NULL; + + /* + * No need to protect slots here -- all the slots are "local" and + * the page lock is already taken + */ + if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { + p += ZHDR_SIZE_ALIGNED; + sz = zhdr->first_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; + moved_chunks = &zhdr->first_chunks; + } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { + p += zhdr->start_middle << CHUNK_SHIFT; + sz = zhdr->middle_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; + moved_chunks = &zhdr->middle_chunks; + } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { + p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + sz = zhdr->last_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; + moved_chunks = &zhdr->last_chunks; + } + + if (sz > 0) { + enum buddy new_bud = HEADLESS; + short chunks = size_to_chunks(sz); + void *q; + + new_zhdr = __z3fold_alloc(pool, sz, false); + if (!new_zhdr) + return NULL; + + if (WARN_ON(new_zhdr == zhdr)) + goto out_fail; + + if (new_zhdr->first_chunks == 0) { + if (new_zhdr->middle_chunks != 0 && + chunks >= new_zhdr->start_middle) { + new_bud = LAST; + } else { + new_bud = FIRST; + } + } else if (new_zhdr->last_chunks == 0) { + new_bud = LAST; + } else if (new_zhdr->middle_chunks == 0) { + new_bud = MIDDLE; + } + q = new_zhdr; + switch (new_bud) { + case FIRST: + new_zhdr->first_chunks = chunks; + q += ZHDR_SIZE_ALIGNED; + break; + case MIDDLE: + new_zhdr->middle_chunks = chunks; + new_zhdr->start_middle = + new_zhdr->first_chunks + ZHDR_CHUNKS; + q += new_zhdr->start_middle << CHUNK_SHIFT; + break; + case LAST: + new_zhdr->last_chunks = chunks; + q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); + break; + default: + goto out_fail; + } + new_zhdr->foreign_handles++; + memcpy(q, p, sz); + write_lock(&zhdr->slots->lock); + *(unsigned long *)old_handle = (unsigned long)new_zhdr + + __idx(new_zhdr, new_bud); + if (new_bud == LAST) + *(unsigned long *)old_handle |= + (new_zhdr->last_chunks << BUDDY_SHIFT); + write_unlock(&zhdr->slots->lock); + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); + + *moved_chunks = 0; + } + + return new_zhdr; + +out_fail: + if (new_zhdr) { + if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) + atomic64_dec(&pool->pages_nr); + else { + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); + } + } + return NULL; + +} + #define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) @@ -638,6 +852,15 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } + if (!zhdr->foreign_handles && buddy_single(zhdr) && + zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + atomic64_dec(&pool->pages_nr); + else + z3fold_page_unlock(zhdr); + return; + } + z3fold_compact_page(zhdr); add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); @@ -690,7 +913,8 @@ lookup: spin_unlock(&pool->lock); page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; put_cpu_ptr(pool->unbuddied); @@ -734,7 +958,8 @@ lookup: spin_unlock(&pool->lock); page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; if (can_sleep) @@ -1000,7 +1225,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) enum buddy bud; bool page_claimed; - zhdr = handle_to_z3fold_header(handle); + zhdr = get_z3fold_header(handle); page = virt_to_page(zhdr); page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private); @@ -1014,6 +1239,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); + put_z3fold_header(zhdr); free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); } @@ -1021,7 +1247,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) } /* Non-headless case */ - z3fold_page_lock(zhdr); bud = handle_to_buddy(handle); switch (bud) { @@ -1037,11 +1262,13 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) default: pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); return; } - free_handle(handle); + if (!page_claimed) + free_handle(handle); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; @@ -1053,7 +1280,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) } if (unlikely(PageIsolated(page)) || test_and_set_bit(NEEDS_COMPACTING, &page->private)) { - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); return; } @@ -1063,14 +1290,14 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_unlock(&pool->lock); zhdr->cpu = -1; kref_get(&zhdr->refcount); - do_compact_page(zhdr, true); clear_bit(PAGE_CLAIMED, &page->private); + do_compact_page(zhdr, true); return; } kref_get(&zhdr->refcount); - queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); + queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + put_z3fold_header(zhdr); } /** @@ -1111,11 +1338,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) */ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) { - int i, ret = 0; + int i, ret = -1; struct z3fold_header *zhdr = NULL; struct page *page = NULL; struct list_head *pos; - struct z3fold_buddy_slots slots; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -1153,6 +1379,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) zhdr = NULL; continue; /* can't evict at this point */ } + if (zhdr->foreign_handles) { + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); + zhdr = NULL; + continue; /* can't evict such page */ + } kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; @@ -1176,39 +1408,38 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) last_handle = 0; middle_handle = 0; if (zhdr->first_chunks) - first_handle = __encode_handle(zhdr, &slots, - FIRST); + first_handle = encode_handle(zhdr, FIRST); if (zhdr->middle_chunks) - middle_handle = __encode_handle(zhdr, &slots, - MIDDLE); + middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) - last_handle = __encode_handle(zhdr, &slots, - LAST); + last_handle = encode_handle(zhdr, LAST); /* * it's safe to unlock here because we hold a * reference to this page */ z3fold_page_unlock(zhdr); } else { - first_handle = __encode_handle(zhdr, &slots, HEADLESS); + first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; } - /* Issue the eviction callback(s) */ if (middle_handle) { ret = pool->ops->evict(pool, middle_handle); if (ret) goto next; + free_handle(middle_handle); } if (first_handle) { ret = pool->ops->evict(pool, first_handle); if (ret) goto next; + free_handle(first_handle); } if (last_handle) { ret = pool->ops->evict(pool, last_handle); if (ret) goto next; + free_handle(last_handle); } next: if (test_bit(PAGE_HEADLESS, &page->private)) { @@ -1264,14 +1495,13 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) void *addr; enum buddy buddy; - zhdr = handle_to_z3fold_header(handle); + zhdr = get_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) goto out; - z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: @@ -1293,8 +1523,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) if (addr) zhdr->mapped_count++; - z3fold_page_unlock(zhdr); out: + put_z3fold_header(zhdr); return addr; } @@ -1309,18 +1539,17 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) struct page *page; enum buddy buddy; - zhdr = handle_to_z3fold_header(handle); + zhdr = get_z3fold_header(handle); page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) return; - z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); zhdr->mapped_count--; - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); } /** @@ -1352,19 +1581,21 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) test_bit(PAGE_STALE, &page->private)) goto out; + if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) + goto out; + pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + if (!list_empty(&page->lru)) + list_del_init(&page->lru); + spin_unlock(&pool->lock); + + kref_get(&zhdr->refcount); + z3fold_page_unlock(zhdr); + return true; - if (zhdr->mapped_count == 0) { - kref_get(&zhdr->refcount); - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - spin_lock(&pool->lock); - if (!list_empty(&page->lru)) - list_del(&page->lru); - spin_unlock(&pool->lock); - z3fold_page_unlock(zhdr); - return true; - } out: z3fold_page_unlock(zhdr); return false; @@ -1387,7 +1618,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa if (!z3fold_page_trylock(zhdr)) { return -EAGAIN; } - if (zhdr->mapped_count != 0) { + if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { z3fold_page_unlock(zhdr); return -EBUSY; } |