Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 16
-rw-r--r-- | mm/backing-dev.c | 20
-rw-r--r-- | mm/bootmem.c | 159
-rw-r--r-- | mm/cma.c | 8
-rw-r--r-- | mm/cma_debug.c | 2
-rw-r--r-- | mm/debug.c | 18
-rw-r--r-- | mm/fadvise.c | 8
-rw-r--r-- | mm/gup.c | 2
-rw-r--r-- | mm/hmm.c | 19
-rw-r--r-- | mm/huge_memory.c | 19
-rw-r--r-- | mm/hugetlb.c | 47
-rw-r--r-- | mm/init-mm.c | 11
-rw-r--r-- | mm/kasan/kasan.c | 5
-rw-r--r-- | mm/kasan/kasan_init.c | 316
-rw-r--r-- | mm/khugepaged.c | 60
-rw-r--r-- | mm/ksm.c | 5
-rw-r--r-- | mm/list_lru.c | 147
-rw-r--r-- | mm/memblock.c | 254
-rw-r--r-- | mm/memcontrol.c | 356
-rw-r--r-- | mm/memfd.c | 2
-rw-r--r-- | mm/memory.c | 225
-rw-r--r-- | mm/memory_hotplug.c | 96
-rw-r--r-- | mm/mempolicy.c | 1
-rw-r--r-- | mm/mempool.c | 12
-rw-r--r-- | mm/migrate.c | 3
-rw-r--r-- | mm/mlock.c | 3
-rw-r--r-- | mm/mmap.c | 76
-rw-r--r-- | mm/mprotect.c | 49
-rw-r--r-- | mm/nobootmem.c | 20
-rw-r--r-- | mm/nommu.c | 16
-rw-r--r-- | mm/oom_kill.c | 16
-rw-r--r-- | mm/page-writeback.c | 4
-rw-r--r-- | mm/page_alloc.c | 68
-rw-r--r-- | mm/page_ext.c | 4
-rw-r--r-- | mm/page_io.c | 3
-rw-r--r-- | mm/readahead.c | 19
-rw-r--r-- | mm/rmap.c | 8
-rw-r--r-- | mm/shmem.c | 63
-rw-r--r-- | mm/slab.h | 6
-rw-r--r-- | mm/slab_common.c | 12
-rw-r--r-- | mm/slub.c | 10
-rw-r--r-- | mm/sparse-vmemmap.c | 61
-rw-r--r-- | mm/sparse.c | 290
-rw-r--r-- | mm/swap_slots.c | 4
-rw-r--r-- | mm/swapfile.c | 77
-rw-r--r-- | mm/usercopy.c | 25
-rw-r--r-- | mm/vmacache.c | 38
-rw-r--r-- | mm/vmalloc.c | 4
-rw-r--r-- | mm/vmscan.c | 241
-rw-r--r-- | mm/vmstat.c | 2
-rw-r--r-- | mm/workingset.c | 30
-rw-r--r-- | mm/zsmalloc.c | 36
-rw-r--r-- | mm/zswap.c | 9
53 files changed, 1954 insertions, 1051 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ce95491abd6a..a550635ea5c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,3 +1,6 @@ + +menu "Memory Management options" + config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -115,10 +118,6 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool -config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - def_bool y - depends on SPARSEMEM && X86_64 - config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE @@ -420,10 +419,11 @@ config ARCH_WANTS_THP_SWAP config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP help Swap transparent huge pages in one piece, without splitting. - XXX: For now this only does clustered swap space allocation. + XXX: For now, swap cluster backing transparent huge page + will be split after swapout. For selection by architectures with reasonable THP sizes. @@ -635,7 +635,7 @@ config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n depends on NO_BOOTMEM - depends on !FLATMEM + depends on SPARSEMEM depends on !NEED_PER_CPU_KM help Ordinarily all struct pages are initialised during early boot in a @@ -762,3 +762,5 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool + +endmenu diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 347cc834c04a..2e5d3df0853d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb) spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); - /* - * Wait for wb shutdown to finish if someone else is just - * running wb_shutdown(). Otherwise we could proceed to wb / - * bdi destruction before wb_shutdown() is finished. - */ - wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); return; } - set_bit(WB_shutting_down, &wb->state); spin_unlock_bh(&wb->work_lock); cgwb_remove_from_bdi_list(wb); @@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); - /* - * Make sure bit gets cleared after shutdown is finished. Matches with - * the barrier provided by test_and_clear_bit() above. 
- */ - smp_wmb(); - clear_and_wake_up_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work) struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); + mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); + mutex_unlock(&wb->bdi->cgwb_release_mutex); fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); @@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; + mutex_init(&bdi->cgwb_release_mutex); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); + spin_unlock_irq(&cgwb_lock); + mutex_lock(&bdi->cgwb_release_mutex); + spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); @@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); + mutex_unlock(&bdi->cgwb_release_mutex); } /** diff --git a/mm/bootmem.c b/mm/bootmem.c index 9e197987b67d..97db0e8e362b 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -21,6 +21,53 @@ #include "internal.h" +/** + * DOC: bootmem overview + * + * Bootmem is a boot-time physical memory allocator and configurator. + * + * It is used early in the boot process before the page allocator is + * set up. + * + * Bootmem is based on the most basic of allocators, a First Fit + * allocator which uses a bitmap to represent memory. If a bit is 1, + * the page is allocated and 0 if unallocated. To satisfy allocations + * of sizes smaller than a page, the allocator records the Page Frame + * Number (PFN) of the last allocation and the offset the allocation + * ended at. Subsequent small allocations are merged together and + * stored on the same page. + * + * The information used by the bootmem allocator is represented by + * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES + * such structures is statically allocated and then it is discarded + * when the system initialization completes. Each entry in this array + * corresponds to a node with memory. For UMA systems only entry 0 is + * used. + * + * The bootmem allocator is initialized during early architecture + * specific setup. Each architecture is required to supply a + * :c:func:`setup_arch` function which, among other tasks, is + * responsible for acquiring the necessary parameters to initialise + * the boot memory allocator. These parameters define limits of usable + * physical memory: + * + * * @min_low_pfn - the lowest PFN that is available in the system + * * @max_low_pfn - the highest PFN that may be addressed by low + * memory (%ZONE_NORMAL) + * * @max_pfn - the last PFN available to the system. + * + * After those limits are determined, the :c:func:`init_bootmem` or + * :c:func:`init_bootmem_node` function should be called to initialize + * the bootmem allocator. The UMA case should use the `init_bootmem` + * function. It will initialize ``contig_page_data`` structure that + * represents the only memory node in the system. 
In the NUMA case the + * `init_bootmem_node` function should be called to initialize the + * bootmem allocator for each node. + * + * Once the allocator is set up, it is possible to use either single + * node or NUMA variant of the allocation APIs. + */ + #ifndef CONFIG_NEED_MULTIPLE_NODES struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] @@ -62,6 +109,8 @@ static unsigned long __init bootmap_bytes(unsigned long pages) /** * bootmem_bootmap_pages - calculate bitmap size in pages * @pages: number of pages the bitmap has to represent + * + * Return: the number of pages needed to hold the bitmap. */ unsigned long __init bootmem_bootmap_pages(unsigned long pages) { @@ -121,7 +170,7 @@ static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, * @startpfn: first pfn on the node * @endpfn: first pfn after the node * - * Returns the number of bytes needed to hold the bitmap for this node. + * Return: the number of bytes needed to hold the bitmap for this node. */ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn) @@ -134,7 +183,7 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, * @start: pfn where the bitmap is to be placed * @pages: number of available physical pages * - * Returns the number of bytes needed to hold the bitmap. + * Return: the number of bytes needed to hold the bitmap. */ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) { @@ -143,15 +192,6 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -/* - * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting physical address of the range - * @size: size of the range in bytes - * - * This is only useful when the bootmem allocator has already been torn - * down, but we are still initializing the system. Pages are given directly - * to the page allocator, no bootmem metadata is updated because it is gone. - */ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; @@ -264,11 +304,6 @@ void __init reset_all_zones_managed_pages(void) reset_managed_pages_done = 1; } -/** - * free_all_bootmem - release free pages to the buddy allocator - * - * Returns the number of pages actually released. - */ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; @@ -385,16 +420,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, BUG(); } -/** - * free_bootmem_node - mark a page range as usable - * @pgdat: node the range resides on - * @physaddr: starting address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must reside completely on the specified node. - */ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { @@ -408,15 +433,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, mark_bootmem_node(pgdat->bdata, start, end, 0, 0); } -/** - * free_bootmem - mark a page range as usable - * @physaddr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must be contiguous but may span node boundaries. 
- */ void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; @@ -439,6 +455,8 @@ void __init free_bootmem(unsigned long physaddr, unsigned long size) * Partial pages will be reserved. * * The range must reside completely on the specified node. + * + * Return: 0 on success, -errno on failure. */ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) @@ -460,6 +478,8 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, * Partial pages will be reserved. * * The range must be contiguous but may span node boundaries. + * + * Return: 0 on success, -errno on failure. */ int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) @@ -646,19 +666,6 @@ restart: return NULL; } -/** - * __alloc_bootmem_nopanic - allocate boot memory without panicking - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * Returns NULL on failure. - */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { @@ -682,19 +689,6 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, return NULL; } -/** - * __alloc_bootmem - allocate boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { @@ -754,21 +748,6 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, return NULL; } -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { @@ -807,19 +786,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -/** - * __alloc_bootmem_low - allocate low boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. 
- */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { @@ -834,21 +800,6 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size, ARCH_LOW_ADDRESS_LIMIT); } -/** - * __alloc_bootmem_low_node - allocate low boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - */ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { @@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP mask to use during compaction + * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - gfp_t gfp_mask) + bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; @@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - gfp_mask); + GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); - if (ret && !(gfp_mask & __GFP_NOWARN)) { + if (ret && !no_warn) { pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f23467291cfb..ad6723e9d110 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0, GFP_KERNEL); + p = cma_alloc(cma, count, 0, false); if (!p) { kfree(mem); return -ENOMEM; diff --git a/mm/debug.c b/mm/debug.c index 56e2d9125ea5..38c926520c97 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -43,12 +43,25 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + bool page_poisoned = PagePoisoned(page); + int mapcount; + + /* + * If struct page is poisoned don't access Page*() functions as that + * leads to recursive loop. Page*() check for poisoned pages, and calls + * dump_page() when detected. + */ + if (page_poisoned) { + pr_emerg("page:%px is uninitialized and poisoned", page); + goto hex_only; + } + /* * Avoid VM_BUG_ON() in page_mapcount(). * page->_mapcount space in struct page is used by sl[aou]b pages to * encode own info. */ - int mapcount = PageSlab(page) ? 0 : page_mapcount(page); + mapcount = PageSlab(page) ? 
0 : page_mapcount(page); pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, @@ -60,6 +73,7 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); +hex_only: print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); @@ -68,7 +82,7 @@ void __dump_page(struct page *page, const char *reason) pr_alert("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (page->mem_cgroup) + if (!page_poisoned && page->mem_cgroup) pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup); #endif } diff --git a/mm/fadvise.c b/mm/fadvise.c index afa41491d324..2d8376e3c640 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -72,8 +72,12 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - /* Careful about overflows. Len == 0 means "as much as possible" */ - endbyte = offset + len; + /* + * Careful about overflows. Len == 0 means "as much as possible". Use + * unsigned math because signed overflows are undefined and UBSan + * complains. + */ + endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) endbyte = -1; else @@ -1238,8 +1238,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) int locked = 0; long ret = 0; - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; for (nstart = start; nstart < end; nstart = nend) { @@ -299,14 +299,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - int r; + vm_fault_t ret; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; - r = handle_mm_fault(vma, addr, flags); - if (r & VM_FAULT_RETRY) + ret = handle_mm_fault(vma, addr, flags); + if (ret & VM_FAULT_RETRY) return -EBUSY; - if (r & VM_FAULT_ERROR) { + if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } @@ -676,7 +676,8 @@ int hmm_vma_get_pfns(struct hmm_range *range) return -EINVAL; /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } @@ -849,7 +850,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block) return -EINVAL; /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } @@ -971,10 +973,7 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size; - - align_start = resource->start & ~(PA_SECTION_SIZE - 1); - align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + resource_size_t key; mutex_lock(&hmm_devmem_lock); for (key = resource->start; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cd7c1a57a14..78427af91de9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -762,11 +762,11 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. 
*/ - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + !pfn_t_devmap(pfn)); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; @@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], vma->vm_mm, + mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); @@ -1312,7 +1312,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); @@ -1328,7 +1328,8 @@ alloc: if (!page) clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + copy_user_huge_page(new_page, page, vmf->address, + vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); mmun_start = haddr; @@ -1740,7 +1741,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -2084,11 +2085,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (vma_is_dax(vma)) return; page = pmd_page(_pmd); + if (!PageDirty(page) && pmd_dirty(_pmd)) + set_page_dirty(page); if (!PageReferenced(page) && pmd_young(_pmd)) SetPageReferenced(page); page_remove_rmap(page, true); put_page(page); - add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3612fbb32e9d..47566bb0b4b1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2101,7 +2101,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_nopanic( + addr = memblock_virt_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { @@ -2119,6 +2119,7 @@ int __alloc_bootmem_huge_page(struct hstate *h) found: BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ + INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); m->hstate = h; return 1; @@ -2139,16 +2140,9 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); struct hstate *h = m->hstate; - struct page *page; -#ifdef CONFIG_HIGHMEM - page = pfn_to_page(m->phys >> PAGE_SHIFT); - memblock_free_late(__pa(m), - sizeof(struct huge_bootmem_page)); -#else - page = virt_to_page(m); -#endif WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); @@ -2163,6 +2157,7 @@ static void __init gather_bootmem_prealloc(void) */ if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); + cond_resched(); } } @@ 
-3166,6 +3161,13 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) return 0; } +/* + * When a new function is introduced to vm_operations_struct and added + * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. + * This is because under System V memory model, mappings created via + * shmget/shmat with "huge page" specified are backed by hugetlbfs files, + * their original vm_ops are overwritten with shm_vm_ops. + */ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, @@ -3510,6 +3512,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + unsigned long haddr = address & huge_page_mask(h); pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ -3519,7 +3522,7 @@ retry_avoidcopy: * and just make the page writable */ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { page_move_anon_rmap(old_page, vma); - set_huge_ptep_writable(vma, address, ptep); + set_huge_ptep_writable(vma, haddr, ptep); return 0; } @@ -3543,7 +3546,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(ptl); - new_page = alloc_huge_page(vma, address, outside_reserve); + new_page = alloc_huge_page(vma, haddr, outside_reserve); if (IS_ERR(new_page)) { /* @@ -3556,11 +3559,10 @@ retry_avoidcopy: if (outside_reserve) { put_page(old_page); BUG_ON(huge_pte_none(pte)); - unmap_ref_private(mm, vma, old_page, address); + unmap_ref_private(mm, vma, old_page, haddr); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h), - huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3590,7 +3592,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); set_page_huge_active(new_page); - mmun_start = address & huge_page_mask(h); + mmun_start = haddr; mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); @@ -3599,25 +3601,24 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h), - huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ - huge_ptep_clear_flush(vma, address, ptep); + huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); - set_huge_pte_at(mm, address, ptep, + set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); - hugepage_add_new_anon_rmap(new_page, vma, address); + hugepage_add_new_anon_rmap(new_page, vma, haddr); /* Make the old page be freed below */ new_page = old_page; } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: - restore_reserve_on_error(h, vma, address, new_page); + restore_reserve_on_error(h, vma, haddr, new_page); put_page(new_page); out_release_old: put_page(old_page); @@ -3820,7 +3821,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ 
-3974,7 +3975,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, haddr, ptep, + ret = hugetlb_cow(mm, vma, address, ptep, pagecache_page, ptl); goto out_put_page; } diff --git a/mm/init-mm.c b/mm/init-mm.c index f0179c9c04c2..a787a319211e 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -15,6 +15,16 @@ #define INIT_MM_CONTEXT(name) #endif +/* + * For dynamically allocated mm_structs, there is a dynamically sized cpumask + * at the end of the structure, the size of which depends on the maximum CPU + * number the system can see. That way we allocate only as much memory for + * mm_cpumask() as needed for the hundreds, or thousands of processes that + * a system typically runs. + * + * Since there is only one init_mm in the entire system, keep it simple + * and size this cpu_bitmask to NR_CPUS. + */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, .pgd = swapper_pg_dir, @@ -25,5 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, + .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index f185455b3406..c3bd5209da38 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -619,12 +619,13 @@ void kasan_kfree_large(void *ptr, unsigned long ip) int kasan_module_alloc(void *addr, size_t size) { void *ret; + size_t scaled_size; size_t shadow_size; unsigned long shadow_start; shadow_start = (unsigned long)kasan_mem_to_shadow(addr); - shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, - PAGE_SIZE); + scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; + shadow_size = round_up(scaled_size, PAGE_SIZE); if (WARN_ON(!PAGE_ALIGNED(shadow_start))) return -EINVAL; diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index f436246ccc79..7a2a2f13f86f 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -17,10 +17,13 @@ #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/pgalloc.h> +#include "kasan.h" + /* * This page serves two purposes: * - It used as early shadow memory. 
The entire shadow region populated @@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; #if CONFIG_PGTABLE_LEVELS > 4 p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return 0; +} #endif #if CONFIG_PGTABLE_LEVELS > 2 pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return 0; +} #endif pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte)); +} + +static inline bool kasan_zero_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page)); +} + static __init void *early_alloc(size_t size, int node) { return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, addr); @@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, } } -static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd = pmd_offset(pud, addr); @@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, } if (pmd_none(*pmd)) { - pmd_populate_kernel(&init_mm, pmd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm, addr); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, pmd, p); } zero_pte_populate(pmd, addr, next); } while (pmd++, addr = next, addr != end); + + return 0; } -static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud = pud_offset(p4d, addr); @@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, } if (pud_none(*pud)) { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pmd_populate(pud, addr, next); } while (pud++, addr = next, addr != end); + + return 0; } -static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, } if (p4d_none(*p4d)) { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, 
addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_pud_populate(p4d, addr, next); } while (p4d++, addr = next, addr != end); + + return 0; } /** @@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, * @shadow_start - start of the memory range to populate * @shadow_end - end of the memory range to populate */ -void __init kasan_populate_zero_shadow(const void *shadow_start, +int __ref kasan_populate_zero_shadow(const void *shadow_start, const void *shadow_end) { unsigned long addr = (unsigned long)shadow_start; @@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - pgd_populate(&init_mm, pgd, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } } zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_zero_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) + pmd_clear(pmd); + continue; + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) + pud_clear(pud); + continue; + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + 
kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) + p4d_clear(p4d); + continue; + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) + pgd_clear(pgd); + continue; + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % + (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || + WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + return -EINVAL; + + ret = kasan_populate_zero_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(shadow_start, + size >> KASAN_SHADOW_SCALE_SHIFT); + return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d7b2a4bf8671..961cbe9062a5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -397,6 +397,26 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + if (shmem_file(vma->vm_file)) { + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + return false; + return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, + HPAGE_PMD_NR); + } + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + return !(vm_flags & VM_NO_KHUGEPAGED); +} + int __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; @@ -434,15 +454,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { unsigned long hstart, hend; - if (!vma->anon_vma) - /* - * Not yet faulted in so we will register later in the - * page fault if needed. - */ - return 0; - if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) - /* khugepaged not yet working on file or special mappings */ + + /* + * khugepaged does not yet work on non-shmem files or special + * mappings. And file-private shmem THP is not supported. 
+ */ + if (!hugepage_vma_check(vma, vm_flags)) return 0; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -819,25 +838,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) } #endif -static bool hugepage_vma_check(struct vm_area_struct *vma) -{ - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_file(vma->vm_file)) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); - } - if (!vma->anon_vma || vma->vm_ops) - return false; - if (is_vma_temporary_stack(vma)) - return false; - return !(vma->vm_flags & VM_NO_KHUGEPAGED); -} - /* * If mmap_sem temporarily dropped, revalidate vma * before taking mmap_sem. @@ -862,7 +862,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma)) + if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; return 0; } @@ -1517,6 +1517,8 @@ tree_unlocked: unlock_page(new_page); *hpage = NULL; + + khugepaged_pages_collapsed++; } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); @@ -1694,7 +1696,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma)) { + if (!hugepage_vma_check(vma, vma->vm_flags)) { skip: progress++; continue; @@ -470,7 +470,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { struct page *page; - int ret = 0; + vm_fault_t ret = 0; do { cond_resched(); @@ -2430,6 +2430,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ + if (vma_is_dax(vma)) + return 0; + #ifdef VM_SAO if (*vm_flags & VM_SAO) return 0; diff --git a/mm/list_lru.c b/mm/list_lru.c index fcfb6c89ed47..5b30625fd365 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/memcontrol.h> -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -29,17 +29,12 @@ static void list_lru_unregister(struct list_lru *lru) list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } -#else -static void list_lru_register(struct list_lru *lru) -{ -} -static void list_lru_unregister(struct list_lru *lru) +static int lru_shrinker_id(struct list_lru *lru) { + return lru->shrinker_id; } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -75,20 +70,39 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, + struct mem_cgroup **memcg_ptr) { - struct mem_cgroup *memcg; + struct list_lru_one *l = &nlru->lru; + struct mem_cgroup *memcg = NULL; if (!nlru->memcg_lrus) - return &nlru->lru; + goto out; memcg = mem_cgroup_from_kmem(ptr); if (!memcg) - return &nlru->lru; + goto out; - return 
list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); + l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +out: + if (memcg_ptr) + *memcg_ptr = memcg; + return l; } #else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} + +static int lru_shrinker_id(struct list_lru *lru) +{ + return -1; +} + static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; @@ -101,23 +115,30 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) } static inline struct list_lru_one * -list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr, + struct mem_cgroup **memcg_ptr) { + if (memcg_ptr) + *memcg_ptr = NULL; return &nlru->lru; } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(nlru, item); + l = list_lru_from_kmem(nlru, item, &memcg); list_add_tail(item, &l->list); - l->nr_items++; + /* Set shrinker bit if the first element was added */ + if (!l->nr_items++) + memcg_set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -135,7 +156,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(nlru, item); + l = list_lru_from_kmem(nlru, item, NULL); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -162,26 +183,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -static unsigned long __list_lru_count_one(struct list_lru *lru, - int nid, int memcg_idx) +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; unsigned long count; rcu_read_lock(); - l = list_lru_from_memcg_idx(nlru, memcg_idx); + l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); count = l->nr_items; rcu_read_unlock(); return count; } - -unsigned long list_lru_count_one(struct list_lru *lru, - int nid, struct mem_cgroup *memcg) -{ - return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); -} EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) @@ -194,17 +209,15 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, +__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; - spin_lock(&nlru->lock); l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: list_for_each_safe(item, n, &l->list) { @@ -250,8 +263,6 @@ restart: BUG(); } } - - spin_unlock(&nlru->lock); return isolated; } @@ -260,11 +271,32 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), - isolate, cb_arg, nr_to_walk); + struct list_lru_node 
*nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock(&nlru->lock); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, + nr_to_walk); + spin_unlock(&nlru->lock); + return ret; } EXPORT_SYMBOL_GPL(list_lru_walk_one); +unsigned long +list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock_irq(&nlru->lock); + ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg, + nr_to_walk); + spin_unlock_irq(&nlru->lock); + return ret; +} + unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) @@ -272,12 +304,18 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, long isolated = 0; int memcg_idx; - isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, - nr_to_walk); + isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, + nr_to_walk); if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { for_each_memcg_cache_index(memcg_idx) { - isolated += __list_lru_walk_one(lru, nid, memcg_idx, - isolate, cb_arg, nr_to_walk); + struct list_lru_node *nlru = &lru->node[nid]; + + spin_lock(&nlru->lock); + isolated += __list_lru_walk_one(nlru, memcg_idx, + isolate, cb_arg, + nr_to_walk); + spin_unlock(&nlru->lock); + if (*nr_to_walk <= 0) break; } @@ -292,7 +330,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -500,10 +538,13 @@ fail: goto out; } -static void memcg_drain_list_lru_node(struct list_lru_node *nlru, - int src_idx, int dst_idx) +static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, + int src_idx, struct mem_cgroup *dst_memcg) { + struct list_lru_node *nlru = &lru->node[nid]; + int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; + bool set; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, @@ -515,14 +556,17 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru, dst = list_lru_from_memcg_idx(nlru, dst_idx); list_splice_init(&src->list, &dst->list); + set = (!dst->nr_items && src->nr_items); dst->nr_items += src->nr_items; + if (set) + memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; spin_unlock_irq(&nlru->lock); } static void memcg_drain_list_lru(struct list_lru *lru, - int src_idx, int dst_idx) + int src_idx, struct mem_cgroup *dst_memcg) { int i; @@ -530,16 +574,16 @@ static void memcg_drain_list_lru(struct list_lru *lru, return; for_each_node(i) - memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); + memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); } -void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru *lru; mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &list_lrus, list) - memcg_drain_list_lru(lru, src_idx, dst_idx); + memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } #else @@ -551,15 +595,21 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru 
*lru, bool memcg_aware, - struct lock_class_key *key) + struct lock_class_key *key, struct shrinker *shrinker) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; int err = -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + if (shrinker) + lru->shrinker_id = shrinker->id; + else + lru->shrinker_id = -1; +#endif memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); @@ -602,6 +652,9 @@ void list_lru_destroy(struct list_lru *lru) kfree(lru->node); lru->node = NULL; +#ifdef CONFIG_MEMCG_KMEM + lru->shrinker_id = -1; +#endif memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/memblock.c b/mm/memblock.c index cc16d70b8333..237944479d25 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -27,6 +27,61 @@ #include "internal.h" +/** + * DOC: memblock overview + * + * Memblock is a method of managing memory regions during the early + * boot period when the usual kernel memory allocators are not up and + * running. + * + * Memblock views the system memory as collections of contiguous + * regions. There are several types of these collections: + * + * * ``memory`` - describes the physical memory available to the + * kernel; this may differ from the actual physical memory installed + * in the system, for instance when the memory is restricted with + * ``mem=`` command line parameter + * * ``reserved`` - describes the regions that were allocated + * * ``physmap`` - describes the actual physical memory regardless of + * the possible restrictions; the ``physmap`` type is only available + * on some architectures. + * + * Each region is represented by :c:type:`struct memblock_region` that + * defines the region extents, its attributes and NUMA node id on NUMA + * systems. Every memory type is described by the :c:type:`struct + * memblock_type` which contains an array of memory regions along with + * the allocator metadata. The memory types are nicely wrapped with + * :c:type:`struct memblock`. This structure is statically initialzed + * at build time. The region arrays for the "memory" and "reserved" + * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the + * "physmap" type to %INIT_PHYSMEM_REGIONS. + * The :c:func:`memblock_allow_resize` enables automatic resizing of + * the region arrays during addition of new regions. This feature + * should be used with care so that memory allocated for the region + * array will not overlap with areas that should be reserved, for + * example initrd. + * + * The early architecture setup should tell memblock what the physical + * memory layout is by using :c:func:`memblock_add` or + * :c:func:`memblock_add_node` functions. The first function does not + * assign the region to a NUMA node and it is appropriate for UMA + * systems. Yet, it is possible to use it on NUMA systems as well and + * assign the region to a NUMA node later in the setup process using + * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` + * performs such an assignment directly. + * + * Once memblock is setup the memory can be allocated using either + * memblock or bootmem APIs. + * + * As the system boot progresses, the architecture specific + * :c:func:`mem_init` function frees all the memory to the buddy page + * allocator. + * + * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * memblock data structures will be discarded after the system + * initialization compltes. 
+ */ + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -61,7 +116,7 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; -ulong __init_memblock choose_memblock_flags(void) +enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -93,10 +148,11 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, return i < type->cnt; } -/* +/** * __memblock_find_range_bottom_up - find free area utility in bottom-up * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -104,13 +160,13 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, * * Utility called from memblock_find_in_range_node(), find free area bottom-up. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, - ulong flags) + enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; @@ -130,7 +186,8 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, /** * __memblock_find_range_top_down - find free area utility, in top-down * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node @@ -138,13 +195,13 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * * Utility called from memblock_find_in_range_node(), find free area top-down. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid, - ulong flags) + enum memblock_flags flags) { phys_addr_t this_start, this_end, cand; u64 i; @@ -170,7 +227,8 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @size: size of free area to find * @align: alignment of free area to find * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @flags: pick from blocks based on memory attributes * @@ -184,12 +242,13 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * If bottom-up allocation failed, will try to allocate memory top-down. * - * RETURNS: + * Return: * Found address on success, 0 on failure. 
*/ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, ulong flags) + phys_addr_t end, int nid, + enum memblock_flags flags) { phys_addr_t kernel_end, ret; @@ -228,7 +287,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); + WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE), + "memblock: bottom-up allocation failed, memory hotremove may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, @@ -238,13 +298,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /** * memblock_find_in_range - find free area in given range * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE * @size: size of free area to find * @align: alignment of free area to find * * Find @size free area aligned to @align in the specified range. * - * RETURNS: + * Return: * Found address on success, 0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, @@ -252,7 +313,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t align) { phys_addr_t ret; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); again: ret = memblock_find_in_range_node(size, align, start, end, @@ -288,7 +349,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK /** - * Discard memory and reserved arrays if they were allocated + * memblock_discard - discard memory and reserved arrays if they were allocated */ void __init memblock_discard(void) { @@ -318,11 +379,11 @@ void __init memblock_discard(void) * * Double the size of the @type regions array. If memblock is being used to * allocate memory for a new reserved regions array and there is a previously - * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * allocated memory range [@new_area_start, @new_area_start + @new_area_size] * waiting to be reserved, ensure the memory used by the new array does * not overlap. * - * RETURNS: + * Return: * 0 on success, -1 on failure. 
*/ static int __init_memblock memblock_double_array(struct memblock_type *type, @@ -331,7 +392,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, { struct memblock_region *new_array, *old_array; phys_addr_t old_alloc_size, new_alloc_size; - phys_addr_t old_size, new_size, addr; + phys_addr_t old_size, new_size, addr, new_end; int use_slab = slab_is_available(); int *in_slab; @@ -392,9 +453,9 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, return -1; } - memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - type->name, type->max * 2, (u64)addr, - (u64)addr + new_size - 1); + new_end = addr + new_size - 1; + memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]", + type->name, type->max * 2, &addr, &new_end); /* * Found space, we now need to move the array over before we add the @@ -467,13 +528,14 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) * @nid: node id of the new region * @flags: flags of the new region * - * Insert new memblock region [@base,@base+@size) into @type at @idx. + * Insert new memblock region [@base, @base + @size) into @type at @idx. * @type must already have extra room to accommodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, phys_addr_t size, - int nid, unsigned long flags) + int nid, + enum memblock_flags flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -495,17 +557,17 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @nid: nid of the new region * @flags: flags of the new region * - * Add new memblock region [@base,@base+@size) into @type. The new region + * Add new memblock region [@base, @base + @size) into @type. The new region * is allowed to overlap with existing ones - overlaps don't affect already * existing regions. @type is guaranteed to be minimal (all neighbouring * compatible regions are merged) after the addition. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, - int nid, unsigned long flags) + int nid, enum memblock_flags flags) { bool insert = false; phys_addr_t obase = base; @@ -589,12 +651,35 @@ repeat: } } +/** + * memblock_add_node - add new memblock region within a NUMA node + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for mode details + * + * Return: + * 0 on success, -errno on failure. + */ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { return memblock_add_range(&memblock.memory, base, size, nid, 0); } +/** + * memblock_add - add new memblock region + * @base: base address of the new region + * @size: size of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for mode details + * + * Return: + * 0 on success, -errno on failure. + */ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; @@ -614,11 +699,11 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) * @end_rgn: out parameter for the end of isolated region * * Walk @type and ensure that regions don't cross the boundaries defined by - * [@base,@base+@size). 
Crossing regions are split at the boundaries, + * [@base, @base + @size). Crossing regions are split at the boundaries, * which may create at most two more regions. The index of the first * region inside the range is returned in *@start_rgn and end in *@end_rgn. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ static int __init_memblock memblock_isolate_range(struct memblock_type *type, @@ -729,10 +814,15 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) } /** + * memblock_setclr_flag - set or clear flag for a memory region + * @base: base address of the region + * @size: size of the region + * @set: set or clear the flag + * @flag: the flag to udpate * * This function isolates region [@base, @base + @size), and sets/clears flag * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ static int __init_memblock memblock_setclr_flag(phys_addr_t base, phys_addr_t size, int set, int flag) @@ -759,7 +849,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { @@ -771,7 +861,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { @@ -783,7 +873,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) { @@ -797,7 +887,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { @@ -809,7 +899,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { @@ -874,7 +964,8 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, +void __init_memblock __next_mem_range(u64 *idx, int nid, + enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -969,9 +1060,6 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, /** * __next_mem_range_rev - generic next function for for_each_*_range_rev() * - * Finds the next range from type_a which is not marked as unsuitable - * in type_b. 
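/*
 * Illustrative sketch, not from this patch: __next_mem_range() and
 * __next_mem_range_rev() back iterators such as for_each_free_mem_range(),
 * which walks "memory" minus "reserved". The function name below is
 * invented for the example.
 */
static phys_addr_t __init example_free_memblock_bytes(void)
{
	phys_addr_t start, end, bytes = 0;
	u64 i;

	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
				&start, &end, NULL)
		bytes += end - start;

	return bytes;
}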
- * * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes * @flags: pick from blocks based on memory attributes @@ -981,9 +1069,13 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. + * * Reverse of __next_mem_range(). */ -void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -1115,10 +1207,10 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base, @base + @size) to @nid. * Regions which cross the area boundaries are split as necessary. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, @@ -1141,7 +1233,8 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, ulong flags) + phys_addr_t end, int nid, + enum memblock_flags flags) { phys_addr_t found; @@ -1163,7 +1256,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, - ulong flags) + enum memblock_flags flags) { return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, flags); @@ -1171,14 +1264,14 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, - int nid, ulong flags) + int nid, enum memblock_flags flags) { return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); phys_addr_t ret; again: @@ -1225,6 +1318,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +#if defined(CONFIG_NO_BOOTMEM) /** * memblock_virt_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes @@ -1241,7 +1335,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * The allocation is performed from memory region limited by * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. * - * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. @@ -1249,7 +1343,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * In addition, function sets the min_count to 0 using kmemleak_alloc for * allocated boot memory block, so that it is never reported as leaks. 
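/*
 * Illustrative sketch (function name invented, not part of this patch):
 * an early allocation that prefers @nid through the documented wrapper,
 * returning NULL instead of panicking when the request cannot be satisfied.
 */
static void * __init example_alloc_on_node(phys_addr_t size, int nid)
{
	return memblock_virt_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES,
						   0,
						   BOOTMEM_ALLOC_ACCESSIBLE,
						   nid);
}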
* - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ static void * __init memblock_virt_alloc_internal( @@ -1259,7 +1353,7 @@ static void * __init memblock_virt_alloc_internal( { phys_addr_t alloc; void *ptr; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; @@ -1334,7 +1428,7 @@ done: * info), if enabled. Does not zero allocated memory, does not panic if request * cannot be satisfied. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid_raw( @@ -1344,9 +1438,9 @@ void * __init memblock_virt_alloc_try_nid_raw( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); @@ -1371,7 +1465,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Public function, provides additional debug information (including caller * info), if enabled. This function zeroes the allocated memory. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. */ void * __init memblock_virt_alloc_try_nid_nopanic( @@ -1381,9 +1475,9 @@ void * __init memblock_virt_alloc_try_nid_nopanic( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); @@ -1407,7 +1501,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * - * RETURNS: + * Return: * Virtual address of allocated memory block on success, NULL on failure. 
*/ void * __init memblock_virt_alloc_try_nid( @@ -1417,9 +1511,9 @@ void * __init memblock_virt_alloc_try_nid( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr, (void *)_RET_IP_); + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { @@ -1427,11 +1521,11 @@ void * __init memblock_virt_alloc_try_nid( return ptr; } - panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", - __func__, (u64)size, (u64)align, nid, (u64)min_addr, - (u64)max_addr); + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n", + __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); return NULL; } +#endif /** * __memblock_free_early - free boot memory block @@ -1443,16 +1537,17 @@ void * __init memblock_virt_alloc_try_nid( */ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) { - memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", - __func__, (u64)base, (u64)base + size - 1, - (void *)_RET_IP_); + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pF\n", + __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); memblock_remove_range(&memblock.reserved, base, size); } -/* +/** * __memblock_free_late - free bootmem block pages directly to buddy allocator - * @addr: phys starting address of the boot memory block + * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * * This is only useful when the bootmem allocator has already been torn @@ -1461,11 +1556,11 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) */ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) { - u64 cursor, end; + phys_addr_t cursor, end; - memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", - __func__, (u64)base, (u64)base + size - 1, - (void *)_RET_IP_); + end = base + size - 1; + memblock_dbg("%s: [%pa-%pa] %pF\n", + __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); end = PFN_DOWN(base + size); @@ -1664,9 +1759,9 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, * @base: base of region to check * @size: size of region to check * - * Check if the region [@base, @base+@size) is a subset of a memory block. + * Check if the region [@base, @base + @size) is a subset of a memory block. * - * RETURNS: + * Return: * 0 if false, non-zero if true */ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) @@ -1685,9 +1780,10 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz * @base: base of region to check * @size: size of region to check * - * Check if the region [@base, @base+@size) intersects a reserved memory block. + * Check if the region [@base, @base + @size) intersects a reserved + * memory block. * - * RETURNS: + * Return: * True if they intersect, false if not. 
*/ bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) @@ -1734,7 +1830,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; - unsigned long flags; + enum memblock_flags flags; int idx; struct memblock_region *rgn; @@ -1752,7 +1848,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n", type->name, idx, &base, &end, &size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6f0d5ef320a..6a921890739f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -233,6 +233,21 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -246,12 +261,7 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
* The main reason for not using cgroup id for this: @@ -305,7 +315,135 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; -#endif /* !CONFIG_SLOB */ +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void memcg_free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + old = rcu_dereference_protected( + mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new); + call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); + } + + return 0; +} + +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = mem_cgroup_nodeinfo(memcg, nid); + map = rcu_dereference_protected(pn->shrinker_map, true); + if (map) + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc(sizeof(*map) + size, GFP_KERNEL); + if (!map) { + memcg_free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +int memcg_expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + for_each_mem_cgroup(memcg) { + if (mem_cgroup_is_root(memcg)) + continue; + ret = memcg_expand_one_shrinker_map(memcg, size, old_size); + if (ret) + goto unlock; + } +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} + +void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + +#else /* CONFIG_MEMCG_KMEM */ +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } +#endif /* CONFIG_MEMCG_KMEM */ /** * mem_cgroup_css_from_page - css of the memcg associated with a page @@ -678,9 +816,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); -static struct mem_cgroup 
*get_mem_cgroup_from_mm(struct mm_struct *mm) +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is + * returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *memcg = NULL; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; rcu_read_lock(); do { @@ -700,6 +849,46 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_unlock(); return memcg; } +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +/** + * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. + * @page: page from which memcg should be extracted. + * + * Obtain a reference on page->memcg and returns it if successful. Otherwise + * root_mem_cgroup is returned. + */ +struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} +EXPORT_SYMBOL(get_mem_cgroup_from_page); + +/** + * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. + */ +static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + if (unlikely(current->active_memcg)) { + struct mem_cgroup *memcg = root_mem_cgroup; + + rcu_read_lock(); + if (css_tryget_online(¤t->active_memcg->css)) + memcg = current->active_memcg; + rcu_read_unlock(); + return memcg; + } + return get_mem_cgroup_from_mm(current->mm); +} /** * mem_cgroup_iter - iterate over memory cgroup hierarchy @@ -850,7 +1039,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) int nid; int i; - while ((memcg = parent_mem_cgroup(memcg))) { + for (; memcg; memcg = parent_mem_cgroup(memcg)) { for_each_node(nid) { mz = mem_cgroup_nodeinfo(memcg, nid); for (i = 0; i <= DEF_PRIORITY; i++) { @@ -862,21 +1051,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } } -/* - * Iteration constructs for visiting all cgroups (under a tree). If - * loops are exited prematurely (break), mem_cgroup_iter_break() must - * be used for reference counting. - */ -#define for_each_mem_cgroup_tree(iter, root) \ - for (iter = mem_cgroup_iter(root, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(root, iter, NULL)) - -#define for_each_mem_cgroup(iter) \ - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(NULL, iter, NULL)) - /** * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy * @memcg: hierarchy root @@ -1483,28 +1657,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +enum oom_status { + OOM_SUCCESS, + OOM_FAILED, + OOM_ASYNC, + OOM_SKIPPED +}; + +static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) - return; + if (order > PAGE_ALLOC_COSTLY_ORDER) + return OOM_SKIPPED; + /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack * that holds all kinds of filesystem and mm locks. 
* - * Also, the caller may handle a failed allocation gracefully - * (like optional page cache readahead) and so an OOM killer - * invocation might not even be necessary. + * cgroup1 allows disabling the OOM killer and waiting for outside + * handling until the charge can succeed; remember the context and put + * the task to sleep at the end of the page fault when all locks are + * released. + * + * On the other hand, in-kernel OOM killer allows for an async victim + * memory reclaim (oom_reaper) and that means that we are not solely + * relying on the oom victim to make a forward progress and we can + * invoke the oom killer here. * - * That's why we don't do anything here except remember the - * OOM context and then deal with it at the end of the page - * fault when the stack is unwound, the locks are released, - * and when we know whether the fault was overall successful. + * Please note that mem_cgroup_out_of_memory might fail to find a + * victim and then we have to bail out from the charge path. */ - css_get(&memcg->css); - current->memcg_in_oom = memcg; - current->memcg_oom_gfp_mask = mask; - current->memcg_oom_order = order; + if (memcg->oom_kill_disable) { + if (!current->in_user_fault) + return OOM_SKIPPED; + css_get(&memcg->css); + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; + + return OOM_ASYNC; + } + + if (mem_cgroup_out_of_memory(memcg, mask, order)) + return OOM_SUCCESS; + + WARN(1,"Memory cgroup charge failed because of no reclaimable memory! " + "This looks like a misconfiguration or a kernel bug."); + return OOM_FAILED; } /** @@ -1899,6 +2098,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; + bool oomed = false; + enum oom_status oom_status; if (mem_cgroup_is_root(memcg)) return 0; @@ -1986,6 +2187,9 @@ retry: if (nr_retries--) goto retry; + if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) + goto nomem; + if (gfp_mask & __GFP_NOFAIL) goto force; @@ -1994,8 +2198,23 @@ retry: memcg_memory_event(mem_over_limit, MEMCG_OOM); - mem_cgroup_oom(mem_over_limit, gfp_mask, + /* + * keep retrying as long as the memcg oom killer is able to make + * a forward progress or bypass the charge if the oom killer + * couldn't make any progress. 
+ */ + oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); + switch (oom_status) { + case OOM_SUCCESS: + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + oomed = true; + goto retry; + case OOM_FAILED: + goto force; + default: + goto nomem; + } nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; @@ -2119,7 +2338,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_alloc_cache_id(void) { int id, size; @@ -2261,7 +2480,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) if (current->memcg_kmem_skip_account) return cachep; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); kmemcg_id = READ_ONCE(memcg->kmemcg_id); if (kmemcg_id < 0) goto out; @@ -2345,7 +2564,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) if (memcg_kmem_bypass()) return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) @@ -2384,7 +2603,7 @@ void memcg_kmem_uncharge(struct page *page, int order) css_put_many(&memcg->css, nr_pages); } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2779,7 +2998,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; @@ -2851,7 +3070,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) } rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); } @@ -2879,7 +3098,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { } -#endif /* !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static int memcg_update_kmem_max(struct mem_cgroup *memcg, unsigned long max) @@ -4037,6 +4256,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); +static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +{ + if (memcg->id.id > 0) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + } +} + static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); @@ -4047,8 +4274,7 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { - idr_remove(&mem_cgroup_idr, memcg->id.id); - memcg->id.id = 0; + mem_cgroup_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); @@ -4176,7 +4402,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); memcg->socket_pressure = jiffies; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -4185,8 +4411,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; fail: - if (memcg->id.id > 0) - idr_remove(&mem_cgroup_idr, memcg->id.id); + mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); return NULL; } @@ -4245,6 +4470,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: + mem_cgroup_id_remove(memcg); 
mem_cgroup_free(memcg); return ERR_PTR(-ENOMEM); } @@ -4253,6 +4479,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * A memcg must be visible for memcg_expand_shrinker_maps() + * by the time the maps are allocated. So, we allocate maps + * here, when for_each_mem_cgroup() can't skip it. + */ + if (memcg_alloc_shrinker_maps(memcg)) { + mem_cgroup_id_remove(memcg); + return -ENOMEM; + } + /* Online state pins memcg ID, memcg ID pins CSS */ atomic_set(&memcg->id.ref, 1); css_get(css); @@ -4305,6 +4541,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); + memcg_free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } @@ -5593,6 +5830,19 @@ out: return ret; } +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) +{ + struct mem_cgroup *memcg; + int ret; + + ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); + memcg = *memcgp; + mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); + return ret; +} + /** * mem_cgroup_commit_charge - commit a page charge * @page: page to charge @@ -6003,7 +6253,7 @@ static int __init mem_cgroup_init(void) { int cpu, node; -#ifndef CONFIG_SLOB +#ifdef CONFIG_MEMCG_KMEM /* * Kmem cache creation is mostly done with the slab_mutex held, * so use a workqueue with limited concurrency to avoid stalling diff --git a/mm/memfd.c b/mm/memfd.c index 27069518e3c5..2bb5e257080e 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -326,7 +326,7 @@ SYSCALL_DEFINE2(memfd_create, goto err_fd; } file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; + file->f_flags |= O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { file_seals = memfd_file_seals_ptr(file); diff --git a/mm/memory.c b/mm/memory.c index 7206a634270b..19f47d7b9b86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #ifdef CONFIG_HAVE_RCU_TABLE_FREE -/* - * See the comment near struct mmu_table_batch. - */ - static void tlb_remove_table_smp_sync(void *arg) { - /* Simply deliver the interrupt */ + struct mm_struct __maybe_unused *mm = arg; + /* + * On most architectures this does nothing. Simply delivering the + * interrupt is enough to prevent races with software page table + * walking like that done in get_user_pages_fast. + * + * See the comment near struct mmu_table_batch. + */ + tlb_flush_remove_tables_local(mm); } -static void tlb_remove_table_one(void *table) +static void tlb_remove_table_one(void *table, struct mmu_gather *tlb) { /* * This isn't an RCU grace period and hence the page-tables cannot be @@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table) * It is however sufficient for software page-table walkers that rely on * IRQ disabling. See the comment near struct mmu_table_batch. 
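/*
 * Context sketch (assumption, the definitions live outside this hunk):
 * architectures that need no extra work when page tables are removed are
 * expected to get empty fallbacks, roughly
 *
 *	#ifndef tlb_flush_remove_tables
 *	#define tlb_flush_remove_tables(mm)		do { } while (0)
 *	#endif
 *	#ifndef tlb_flush_remove_tables_local
 *	#define tlb_flush_remove_tables_local(mm)	do { } while (0)
 *	#endif
 *
 * so that tlb_remove_table_smp_sync() above remains a plain "deliver the
 * interrupt" helper on most architectures.
 */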
*/ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1); __tlb_remove_table(table); } @@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; + tlb_flush_remove_tables(tlb->mm); + if (*batch) { call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; @@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { - tlb_remove_table_one(table); + tlb_remove_table_one(table, tlb); return; } (*batch)->nr = 0; @@ -853,6 +859,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } } + + if (pte_devmap(pte)) + return NULL; + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -917,6 +927,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, } } + if (pmd_devmap(pmd)) + return NULL; if (is_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) @@ -1417,11 +1429,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { - if (next - addr != HPAGE_PMD_SIZE) { - VM_BUG_ON_VMA(vma_is_anonymous(vma) && - !rwsem_is_locked(&tlb->mm->mmap_sem), vma); + if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - } else if (zap_huge_pmd(tlb, vma, pmd, addr)) + else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ } @@ -1603,20 +1613,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, end, NULL); - - /* - * zap_page_range does not specify whether mmap_sem should be - * held for read or write. That allows parallel zap_page_range - * operations to unmap a PTE and defer a flush meaning that - * this call observes pte_none and fails to flush the TLB. - * Rather than adding a complex API, ensure that no stale - * TLB entries exist when this call returns. 
- */ - flush_tlb_range(vma, start, end); - } - mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -1886,6 +1884,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (!pfn_modify_allowed(pfn, pgprot)) + return -EACCES; + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, @@ -1921,6 +1922,9 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); + if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + return -EACCES; + /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* @@ -1982,6 +1986,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) @@ -1989,12 +1994,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -2003,6 +2012,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, { pmd_t *pmd; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); @@ -2011,9 +2021,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); - if (remap_pte_range(mm, pmd, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pmd++, addr = next, addr != end); return 0; } @@ -2024,6 +2035,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, { pud_t *pud; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); @@ -2031,9 +2043,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (remap_pmd_range(mm, pud, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (pud++, addr = next, addr != end); return 0; } @@ -2044,6 +2057,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, { p4d_t *p4d; unsigned long next; + int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); @@ -2051,9 +2065,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (remap_pud_range(mm, p4d, addr, next, - pfn + (addr >> PAGE_SHIFT), prot)) - return -ENOMEM; + err = remap_pud_range(mm, p4d, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; } while (p4d++, addr = next, addr != end); return 0; } @@ -2503,7 +2518,7 @@ static int wp_page_copy(struct vm_fault *vmf) cow_user_page(new_page, old_page, vmf->address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(new_page, mm, 
GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -3003,8 +3018,8 @@ int do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3165,7 +3180,8 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, + false)) goto oom_free_page; /* @@ -3372,7 +3388,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held @@ -3661,7 +3677,7 @@ static int do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; @@ -4125,7 +4141,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) - mem_cgroup_oom_enable(); + mem_cgroup_enter_user_fault(); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); @@ -4133,7 +4149,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { - mem_cgroup_oom_disable(); + mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no @@ -4397,6 +4413,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); + if (!maddr) + return -ENOMEM; + if (write) memcpy_toio(maddr + offset, buf, len); else @@ -4568,71 +4587,93 @@ EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -static void clear_gigantic_page(struct page *page, - unsigned long addr, - unsigned int pages_per_huge_page) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < pages_per_huge_page; - i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -void clear_huge_page(struct page *page, - unsigned long addr_hint, unsigned int pages_per_huge_page) +/* + * Process all subpages of the specified huge page with the specified + * operation. The target subpage will be processed last to keep its + * cache lines hot. 
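/*
 * Worked example for the helper that follows: with pages_per_huge_page == 8
 * and a target subpage index of 2, the subpages are processed in the order
 * 7, 6, 5, 4, 0, 3, 1, 2 - the far half first, then alternating left/right
 * towards the target, which is touched last so its cache lines stay hot.
 */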
+ */ +static inline void process_huge_page( + unsigned long addr_hint, unsigned int pages_per_huge_page, + void (*process_subpage)(unsigned long addr, int idx, void *arg), + void *arg) { int i, n, base, l; unsigned long addr = addr_hint & ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); - if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, pages_per_huge_page); - return; - } - - /* Clear sub-page to access last to keep its cache lines hot */ + /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= pages_per_huge_page) { - /* If sub-page to access in first half of huge page */ + /* If target subpage in first half of huge page */ base = 0; l = n; - /* Clear sub-pages at the end of huge page */ + /* Process subpages at the end of huge page */ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + process_subpage(addr + i * PAGE_SIZE, i, arg); } } else { - /* If sub-page to access in second half of huge page */ + /* If target subpage in second half of huge page */ base = pages_per_huge_page - 2 * (pages_per_huge_page - n); l = pages_per_huge_page - n; - /* Clear sub-pages at the begin of huge page */ + /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + process_subpage(addr + i * PAGE_SIZE, i, arg); } } /* - * Clear remaining sub-pages in left-right-left-right pattern - * towards the sub-page to access + * Process remaining subpages in left-right-left-right pattern + * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); - clear_user_highpage(page + left_idx, - addr + left_idx * PAGE_SIZE); + process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); cond_resched(); - clear_user_highpage(page + right_idx, - addr + right_idx * PAGE_SIZE); + process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); } } +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} + +static void clear_subpage(unsigned long addr, int idx, void *arg) +{ + struct page *page = arg; + + clear_user_highpage(page + idx, addr); +} + +void clear_huge_page(struct page *page, + unsigned long addr_hint, unsigned int pages_per_huge_page) +{ + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); +} + static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, @@ -4652,11 +4693,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, } } +struct copy_subpage_arg { + struct page *dst; + struct page *src; + struct vm_area_struct *vma; +}; + +static void copy_subpage(unsigned long addr, int idx, void *arg) +{ + struct copy_subpage_arg *copy_arg = arg; + + copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, + addr, copy_arg->vma); +} + void copy_user_huge_page(struct page 
*dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { - int i; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + struct copy_subpage_arg arg = { + .dst = dst, + .src = src, + .vma = vma, + }; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { copy_user_gigantic_page(dst, src, addr, vma, @@ -4664,11 +4725,7 @@ void copy_user_huge_page(struct page *dst, struct page *src, return; } - might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } + process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); } long copy_huge_page_from_user(struct page *dst_page, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7deb49f69e27..4eb6e824a80c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1034,8 +1034,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) return pgdat; } -static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +static void rollback_node_hotadd(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); + arch_refresh_nodedata(nid, NULL); free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); @@ -1046,28 +1048,48 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) /** * try_online_node - online a node if offlined * @nid: the node ID - * + * @start: start addr of the node + * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. + * + * Returns: + * 1 -> a new node has been allocated + * 0 -> the node is already online + * -ENOMEM -> the node could not be allocated */ -int try_online_node(int nid) +static int __try_online_node(int nid, u64 start, bool set_node_online) { - pg_data_t *pgdat; - int ret; + pg_data_t *pgdat; + int ret = 1; if (node_online(nid)) return 0; - mem_hotplug_begin(); - pgdat = hotadd_new_pgdat(nid, 0); + pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); + + if (set_node_online) { + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + } out: + return ret; +} + +/* + * Users of this function always want to online/register the node + */ +int try_online_node(int nid) +{ + int ret; + + mem_hotplug_begin(); + ret = __try_online_node(nid, 0, true); mem_hotplug_done(); return ret; } @@ -1099,9 +1121,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; - pg_data_t *pgdat = NULL; - bool new_pgdat; - bool new_node; + bool new_node = false; int ret; start = res->start; @@ -1111,11 +1131,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) if (ret) return ret; - { /* Stupid hack to suppress address-never-null warning */ - void *p = NODE_DATA(nid); - new_pgdat = !p; - } - mem_hotplug_begin(); /* @@ -1126,48 +1141,31 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) */ memblock_add_node(start, size, nid); - new_node = !node_online(nid); - if (new_node) { - pgdat = hotadd_new_pgdat(nid, start); - ret = -ENOMEM; - if (!pgdat) - goto error; - } + ret = __try_online_node(nid, start, false); + if (ret < 0) + goto error; + new_node = ret; /* call arch's memory hotadd */ ret = 
arch_add_memory(nid, start, size, NULL, true); - if (ret < 0) goto error; - /* we online node here. we can't roll back from here. */ - node_set_online(nid); - if (new_node) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - ret = __register_one_node(nid); - if (ret) - goto register_fail; - - /* - * link memory sections under this node. This is already - * done when creatig memory section in register_new_memory - * but that depends to have the node registered so offline - * nodes have to go through register_node. - * TODO clean up this mess. - */ - ret = link_mem_sections(nid, start_pfn, nr_pages, false); -register_fail: - /* - * If sysfs file of new node can't create, cpu on the node + /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. * So, check by BUG_ON() to catch it reluctantly.. + * We online node here. We can't roll back from here. */ + node_set_online(nid); + ret = __register_one_node(nid); BUG_ON(ret); } + /* link memory sections under this node.*/ + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); + BUG_ON(ret); + /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); @@ -1180,8 +1178,8 @@ register_fail: error: /* rollback pgdat allocation and others */ - if (new_pgdat && pgdat) - rollback_node_hotadd(nid, pgdat); + if (new_node) + rollback_node_hotadd(nid); memblock_remove(start, size); out: diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9ac49ef17b4e..01f1a14facc4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2505,6 +2505,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); + vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/mempool.c b/mm/mempool.c index b54f2c20e5e0..44f5fa98c1e7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -111,7 +111,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) +static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_slab(element); @@ -127,12 +127,12 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool, gfp_t flags) +static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element, flags); + kasan_unpoison_element(pool, element); check_element(pool, element); return element; } @@ -151,7 +151,7 @@ static void *remove_element(mempool_t *pool, gfp_t flags) void mempool_exit(mempool_t *pool) { while (pool->curr_nr) { - void *element = remove_element(pool, GFP_KERNEL); + void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -301,7 +301,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool, GFP_KERNEL); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, 
flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -387,7 +387,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool, gfp_temp); + element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); diff --git a/mm/migrate.c b/mm/migrate.c index 8c0af0f7cab1..4a83268e23c2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2951,7 +2951,8 @@ int migrate_vma(const struct migrate_vma_ops *ops, /* Sanity check the arguments */ start &= PAGE_MASK; end &= PAGE_MASK; - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || + vma_is_dax(vma)) return -EINVAL; if (start < vma->vm_start || start >= vma->vm_end) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 74e5a6547c3d..41cc47e28ad6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -527,7 +527,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, vm_flags_t old_flags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || + vma_is_dax(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mmap.c b/mm/mmap.c index d1eb87ef4b1a..8d6449e74431 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,12 +182,12 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return next; } -static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); - +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, + struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long retval; @@ -245,7 +245,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) + if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) goto out; set_brk: @@ -911,7 +911,7 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + vm_area_free(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. 
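/*
 * Context sketch (assumption, the helpers are defined outside this diff):
 * vm_area_alloc(), vm_area_dup() and vm_area_free() centralize what the
 * open-coded kmem_cache_zalloc()/kmem_cache_free() calls being replaced in
 * this file used to do, roughly as below; vma_set_anonymous() is likewise
 * assumed to mark the vma as anonymous (no vm_ops).
 */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (vma)
		vma_init(vma, mm);	/* zero, set vm_mm, init anon_vma_chain */
	return vma;
}

void vm_area_free(struct vm_area_struct *vma)
{
	kmem_cache_free(vm_area_cachep, vma);
}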
*/ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -1780,6 +1778,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, error = shmem_zero_setup(vma); if (error) goto free_vma; + } else { + vma_set_anonymous(vma); } vma_link(mm, vma, prev, rb_link, rb_parent); @@ -1796,11 +1796,12 @@ out: vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm))) - mm->locked_vm += (len >> PAGE_SHIFT); - else + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); } if (file) @@ -1832,7 +1833,7 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -2620,15 +2621,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return err; } - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) return -ENOMEM; - /* most fields are the same, copy all, and then fixup */ - *new = *vma; - - INIT_LIST_HEAD(&new->anon_vma_chain); - if (new_below) new->vm_end = addr; else { @@ -2669,7 +2665,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: - kmem_cache_free(vm_area_cachep, new); + vm_area_free(new); return err; } @@ -2929,21 +2925,14 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) +static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; - len = PAGE_ALIGN(request); - if (len < request) - return -ENOMEM; - if (!len) - return 0; - /* Until we need other flags, refuse anything except VM_EXEC. 
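The length validation removed from do_brk_flags() here reappears in vm_brk_flags() in the hunk below; the "len < request" test after PAGE_ALIGN() is an overflow check, since rounding a value near ULONG_MAX up to a page boundary wraps around. A standalone sketch of that check, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Returns 0 on success, -1 if rounding the request up overflowed. */
static int check_brk_len(unsigned long request, unsigned long *len)
{
	*len = PAGE_ALIGN(request);
	if (*len < request)	/* wrapped past ULONG_MAX */
		return -1;
	return 0;
}

int main(void)
{
	unsigned long len;

	printf("%d\n", check_brk_len(5000, &len));		/* 0, len = 8192 */
	printf("%d\n", check_brk_len(~0UL - 100, &len));	/* -1, overflow */
	return 0;
}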
*/ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; @@ -2991,14 +2980,13 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(mm); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; + vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; @@ -3015,18 +3003,20 @@ out: return 0; } -static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) -{ - return do_brk_flags(addr, len, 0, uf); -} - -int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) +int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + unsigned long len; int ret; bool populate; LIST_HEAD(uf); + len = PAGE_ALIGN(request); + if (len < request) + return -ENOMEM; + if (!len) + return 0; + if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -3207,16 +3197,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { - new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new_vma = vm_area_dup(vma); if (!new_vma) goto out; - *new_vma = *vma; new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) @@ -3231,7 +3219,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: - kmem_cache_free(vm_area_cachep, new_vma); + vm_area_free(new_vma); out: return NULL; } @@ -3355,12 +3343,10 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -3381,7 +3367,7 @@ static struct vm_area_struct *__install_special_mapping( return vma; out: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ERR_PTR(ret); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 625608bc8962..6d331620b9e5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -306,6 +306,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, return pages; } +static int prot_none_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? 
+ 0 : -EACCES; +} + +static int prot_none_test(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return 0; +} + +static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + pgprot_t new_pgprot = vm_get_page_prot(newflags); + struct mm_walk prot_none_walk = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, + .mm = current->mm, + .private = &new_pgprot, + }; + + return walk_page_range(start, end, &prot_none_walk); +} + int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) @@ -324,6 +360,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } /* + * Do PROT_NONE PFN permission checks here when we can still + * bail out without undoing a lot of state. This is a rather + * uncommon case, so doesn't need to be very optimized. + */ + if (arch_has_pfn_modify_check() && + (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { + error = prot_none_walk(vma, start, end, newflags); + if (error) + return error; + } + + /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again. hugetlb mapping were accounted for diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 9b02fda0886b..439af3b765a7 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -42,7 +42,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, { void *ptr; u64 addr; - ulong flags = choose_memblock_flags(); + enum memblock_flags flags = choose_memblock_flags(); if (limit > memblock.current_limit) limit = memblock.current_limit; @@ -72,7 +72,7 @@ again: return ptr; } -/* +/** * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range * @size: size of the range in bytes @@ -176,7 +176,7 @@ void __init reset_all_zones_managed_pages(void) /** * free_all_bootmem - release free pages to the buddy allocator * - * Returns the number of pages actually released. + * Return: the number of pages actually released. */ unsigned long __init free_all_bootmem(void) { @@ -193,7 +193,7 @@ unsigned long __init free_all_bootmem(void) /** * free_bootmem_node - mark a page range as usable * @pgdat: node the range resides on - * @physaddr: starting address of the range + * @physaddr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -208,7 +208,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. @@ -256,7 +256,7 @@ restart: * * Allocation may happen on any node in the system. * - * Returns NULL on failure. + * Return: address of the allocated region or %NULL on failure. */ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) @@ -293,6 +293,8 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, * Allocation may happen on any node in the system. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. 
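The mprotect_fixup() hunk below only pays for the page-table walk when all three conditions hold: the architecture opts in, the VMA maps raw PFNs, and the new protection removes read, write and execute entirely. A small sketch of that gate as a standalone predicate; the flag values here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define VM_READ		0x01
#define VM_WRITE	0x02
#define VM_EXEC		0x04
#define VM_PFNMAP	0x10
#define VM_MIXEDMAP	0x20

/* Stand-in for the arch hook; assume the architecture wants the check. */
static bool arch_has_pfn_modify_check(void) { return true; }

/* True when the PROT_NONE PFN walk is required before changing protection. */
static bool needs_prot_none_walk(unsigned long vm_flags, unsigned long newflags)
{
	return arch_has_pfn_modify_check() &&
	       (vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) &&
	       (newflags & (VM_READ | VM_WRITE | VM_EXEC)) == 0;
}

int main(void)
{
	printf("%d\n", needs_prot_none_walk(VM_PFNMAP, 0));		/* 1 */
	printf("%d\n", needs_prot_none_walk(VM_PFNMAP, VM_READ));	/* 0 */
	printf("%d\n", needs_prot_none_walk(0, 0));			/* 0 */
	return 0;
}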
*/ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) @@ -367,6 +369,8 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, * can not hold the requested memory. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) @@ -396,6 +400,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, * Allocation may happen on any node in the system. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) @@ -425,6 +431,8 @@ void * __init __alloc_bootmem_low_nopanic(unsigned long size, * can not hold the requested memory. * * The function panics if the request can not be satisfied. + * + * Return: address of the allocated region. */ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) diff --git a/mm/nommu.c b/mm/nommu.c index 4452d8bd9ae4..e4aac33216ae 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -364,10 +364,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size @@ -769,7 +765,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); } /* @@ -1145,6 +1141,8 @@ static int do_mmap_private(struct vm_area_struct *vma, if (ret < len) memset(base + ret, 0, len - ret); + } else { + vma_set_anonymous(vma); } return 0; @@ -1204,7 +1202,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(current->mm); if (!vma) goto error_getting_vma; @@ -1212,7 +1210,6 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; @@ -1368,7 +1365,7 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ret; sharing_violation: @@ -1469,14 +1466,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!region) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) { kmem_cache_free(vm_region_jar, region); return -ENOMEM; } /* most fields are the same, copy all, and then fixup */ - *new = *vma; *region = *vma->vm_region; new->vm_region = region; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2cc9b238368f..7c74dcc2d26d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,6 +53,14 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; +/* + * Serializes oom killer invocations (out_of_memory()) from all contexts to + * prevent from over eager oom killing (e.g. when the oom killer is invoked + * from different domains). 
+ * + * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled + * and mark_oom_victim + */ DEFINE_MUTEX(oom_lock); #ifdef CONFIG_NUMA @@ -1077,15 +1085,9 @@ bool out_of_memory(struct oom_control *oc) dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (oc->chosen && oc->chosen != (void *)-1UL) { + if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : "Memory cgroup out of memory"); - /* - * Give the killed process a good chance to exit before trying - * to allocate memory again. - */ - schedule_timeout_killable(1); - } return !!oc->chosen; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 337c6afb3345..6551d3b0dc30 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2490,8 +2490,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* * Call this whenever redirtying a page, to de-account the dirty counters - * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written - * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to + * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written + * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to * systematic errors in balanced_dirty_ratelimit and the dirty pages position * control. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1521100f1e63..15ea511fb41c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -155,16 +155,17 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with pm_mutex held (gfp_allowed_mask also should - * only be modified with pm_mutex held, unless the suspend/hibernate code is - * guaranteed not to run in parallel with that modification). + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; @@ -173,7 +174,7 @@ void pm_restore_gfp_mask(void) void pm_restrict_gfp_mask(void) { - WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); @@ -4164,11 +4165,12 @@ retry: alloc_flags = reserve_flags; /* - * Reset the zonelist iterators if memory policies can be ignored. - * These allocations are high priority and system rather than user - * orientated. + * Reset the nodemask and zonelist iterators if memory policies can be + * ignored. These allocations are high priority and system rather than + * user oriented. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { + ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); } @@ -4402,19 +4404,15 @@ out: EXPORT_SYMBOL(__alloc_pages_nodemask); /* - * Common helper functions. + * Common helper functions. 
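The pm_restrict_gfp_mask()/pm_restore_gfp_mask() hunks above follow a plain save-then-mask pattern around suspend: stash the current gfp_allowed_mask, clear the I/O and FS bits, and restore the saved copy afterwards. A userspace sketch of that pattern with illustrative bit names; the locking assertions on system_transition_mutex are omitted:

#include <assert.h>

#define GFP_IO	0x1
#define GFP_FS	0x2

static unsigned int gfp_allowed_mask = GFP_IO | GFP_FS;
static unsigned int saved_gfp_mask;

static void restrict_gfp_mask(void)
{
	assert(!saved_gfp_mask);		/* not already restricted */
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(GFP_IO | GFP_FS);	/* no I/O while devices are suspended */
}

static void restore_gfp_mask(void)
{
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

int main(void)
{
	restrict_gfp_mask();
	assert(gfp_allowed_mask == 0);
	restore_gfp_mask();
	assert(gfp_allowed_mask == (GFP_IO | GFP_FS));
	return 0;
}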
Never use with __GFP_HIGHMEM because the returned + * address cannot represent highmem pages. Use alloc_pages and then kmap if + * you need to access high mem. */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - /* - * __get_free_pages() returns a virtual address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask, order); + page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0; return (unsigned long) page_address(page); @@ -5566,13 +5564,12 @@ static int zone_batchsize(struct zone *zone) /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/2 of a meg. - * - * OK, so we don't know how big the cache is. So guess. + * size of the zone. */ batch = zone->managed_pages / 1024; - if (batch * PAGE_SIZE > 512 * 1024) - batch = (512 * 1024) / PAGE_SIZE; + /* But no more than a meg. */ + if (batch * PAGE_SIZE > 1024 * 1024) + batch = (1024 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -6383,7 +6380,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -#ifdef CONFIG_HAVE_MEMBLOCK +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). But, there are some @@ -6404,8 +6401,11 @@ void __paginginit zero_resv_unavail(void) pgcnt = 0; for_each_resv_unavail_range(i, &start, &end) { for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; continue; + } mm_zero_struct_page(pfn_to_page(pfn)); pgcnt++; } @@ -6421,7 +6421,7 @@ void __paginginit zero_resv_unavail(void) if (pgcnt) pr_info("Reserved but unavailable: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK */ +#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6847,6 +6847,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); + zero_resv_unavail(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid, NULL, @@ -6857,7 +6858,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } - zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core, @@ -6939,9 +6939,21 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + struct page *page = virt_to_page(pos); + void *direct_map_addr; + + /* + * 'direct_map_addr' might be different from 'pos' + * because some architectures' virt_to_page() + * work with aliases. Getting the direct map + * address ensures that we get a _writeable_ + * alias for the memset(). 
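The zone_batchsize() hunk above raises the per-cpu batch cap from half a megabyte to one megabyte: the batch is roughly 1/1024 of the zone, clamped to 1 MiB, then quartered. A worked sketch of that arithmetic, assuming 4 KiB pages and leaving out the final rounding the kernel applies afterwards:

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long zone_batch(unsigned long managed_pages)
{
	unsigned long batch = managed_pages / 1024;	/* ~1/1000th of the zone */

	if (batch * PAGE_SIZE > 1024 * 1024)		/* but no more than a meg */
		batch = (1024 * 1024) / PAGE_SIZE;
	batch /= 4;					/* effectively *= 4 below */
	if (batch < 1)
		batch = 1;
	return batch;
}

int main(void)
{
	/* 4 GiB zone: 1048576 pages -> capped at 256 pages -> 64 */
	printf("%lu\n", zone_batch(1048576));
	/* tiny zone: stays at the minimum of 1 */
	printf("%lu\n", zone_batch(512));
	return 0;
}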
+ */ + direct_map_addr = page_address(page); if ((unsigned int)poison <= 0xFF) - memset(pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page(pos)); + memset(direct_map_addr, poison, PAGE_SIZE); + + free_reserved_page(page); } if (pages && s) @@ -7033,9 +7045,9 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init free_area_init(unsigned long *zones_size) { + zero_resv_unavail(); free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); - zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) diff --git a/mm/page_ext.c b/mm/page_ext.c index 5295ef331165..a9826da84ccb 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; @@ -195,7 +195,7 @@ fail: #else /* CONFIG_FLAT_NODE_MEM_MAP */ -struct page_ext *lookup_page_ext(struct page *page) +struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); diff --git a/mm/page_io.c b/mm/page_io.c index b41cf9644585..aafd19ec1db4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); diff --git a/mm/readahead.c b/mm/readahead.c index e273f0de3376..a59ea70527b9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -19,6 +19,7 @@ #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> +#include <linux/blk-cgroup.h> #include "internal.h" @@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; + unsigned long add_pages; pgoff_t prev_offset; /* @@ -474,10 +476,17 @@ readit: * Will this read hit the readahead marker made by itself? * If so, trigger the readahead marker hit now, and merge * the resulted next readahead window into the current one. + * Take care of maximum IO pages as above. 
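The readahead merge the comment above describes, continued in the hunk below, caps the combined window at max_pages: the next window is folded into the current one only when the sum still fits, otherwise the window is reset to the maximum with half of it marked async. A standalone sketch of that decision, with get_next_ra_size() replaced by a caller-supplied value:

#include <stdio.h>

struct ra_state {
	unsigned long size;		/* current readahead window, in pages */
	unsigned long async_size;	/* async readahead marker */
};

static void merge_ra_window(struct ra_state *ra, unsigned long add_pages,
			    unsigned long max_pages)
{
	if (ra->size + add_pages <= max_pages) {
		ra->async_size = add_pages;	/* grow the window */
		ra->size += add_pages;
	} else {
		ra->size = max_pages;		/* clamp to the per-file maximum */
		ra->async_size = max_pages >> 1;
	}
}

int main(void)
{
	struct ra_state ra = { .size = 32, .async_size = 32 };

	merge_ra_window(&ra, 64, 128);
	printf("%lu %lu\n", ra.size, ra.async_size);	/* 96 64 */
	merge_ra_window(&ra, 64, 128);
	printf("%lu %lu\n", ra.size, ra.async_size);	/* 128 64, clamped */
	return 0;
}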
*/ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max_pages); - ra->size += ra->async_size; + add_pages = get_next_ra_size(ra, max_pages); + if (ra->size + add_pages <= max_pages) { + ra->async_size = add_pages; + ra->size += add_pages; + } else { + ra->size = max_pages; + ra->async_size = max_pages >> 1; + } } return ra_submit(ra, mapping, filp); @@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + if (blk_cgroup_congested()) + return; + /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { force_page_cache_readahead(mapping, filp, offset, req_size); @@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping, if (inode_read_congested(mapping->host)) return; + if (blk_cgroup_congested()) + return; + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); } diff --git a/mm/rmap.c b/mm/rmap.c index 6db729dc4c50..eb477809a5c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -64,6 +64,7 @@ #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> +#include <linux/userfaultfd_k.h> #include <asm/tlbflush.h> @@ -1481,11 +1482,16 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pvmw.pte, pteval); } - } else if (pte_unused(pteval)) { + } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. + * A future reference will then fault in a new zero + * page. When userfaultfd is active, we must not drop + * this page though, as its main user (postcopy + * migration) will not expect userfaults on already + * copied pages. */ dec_mm_counter(mm, mm_counter(page)); /* We have to invalidate as we cleared the pte */ diff --git a/mm/shmem.c b/mm/shmem.c index 2cab84403055..c48c79018a7c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -29,6 +29,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> +#include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/swap.h> @@ -1239,8 +1240,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. 
*/ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, - false); + error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, + &memcg, false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1421,6 +1422,7 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, { /* Create a pseudo vma that just contains the policy */ memset(vma, 0, sizeof(*vma)); + vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); @@ -1712,7 +1714,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1818,7 +1820,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); if (error) goto unacct; @@ -2187,7 +2189,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = get_seconds(); + inode->i_generation = prandom_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); @@ -2291,7 +2293,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, __SetPageSwapBacked(page); __SetPageUptodate(page); - ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; @@ -3896,18 +3898,11 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static const struct dentry_operations anon_ops = { - .d_dname = simple_dname -}; - static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags, unsigned int i_flags) { - struct file *res; struct inode *inode; - struct path path; - struct super_block *sb; - struct qstr this; + struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); @@ -3918,41 +3913,21 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - res = ERR_PTR(-ENOMEM); - this.name = name; - this.len = strlen(name); - this.hash = 0; /* will go */ - sb = mnt->mnt_sb; - path.mnt = mntget(mnt); - path.dentry = d_alloc_pseudo(sb, &this); - if (!path.dentry) - goto put_memory; - d_set_d_op(path.dentry, &anon_ops); - - res = ERR_PTR(-ENOSPC); - inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags); - if (!inode) - goto put_memory; - + inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, + flags); + if (unlikely(!inode)) { + shmem_unacct_size(flags, size); + return ERR_PTR(-ENOSPC); + } inode->i_flags |= i_flags; - d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (!IS_ERR(res)) + res = alloc_file_pseudo(inode, mnt, name, O_RDWR, + &shmem_file_operations); if (IS_ERR(res)) - goto put_path; - - res = alloc_file(&path, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); - if (IS_ERR(res)) - goto put_path; - - return 
res; - -put_memory: - shmem_unacct_size(flags, size); -put_path: - path_put(&path); + iput(inode); return res; } diff --git a/mm/slab.h b/mm/slab.h index 68bdf498da3b..58c6c1c2a78e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* List of all root caches. */ extern struct list_head slab_root_caches; @@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s); extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, void (*deact_fn)(struct kmem_cache *)); -#else /* CONFIG_MEMCG && !CONFIG_SLOB */ +#else /* CONFIG_MEMCG_KMEM */ /* If !memcg, all caches are root. */ #define slab_root_caches slab_caches @@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 890b1f04a03a..fea3376f9816 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM LIST_HEAD(slab_root_caches); @@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s) static inline void memcg_unlink_cache(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ /* * Figure out what the alignment of the objects will be given a set of @@ -567,10 +567,14 @@ static int shutdown_cache(struct kmem_cache *s) list_del(&s->list); if (s->flags & SLAB_TYPESAFE_BY_RCU) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_unlink(s); +#endif list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { #ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_unlink(s); sysfs_slab_release(s); #else slab_kmem_cache_release(s); @@ -580,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. 
@@ -857,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s) static inline void flush_memcg_workqueue(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +#endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { diff --git a/mm/slub.c b/mm/slub.c index a3b8467c14af..ce2b9e5cea77 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -271,8 +271,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - if (object) - prefetch(freelist_dereference(s, object + s->offset)); + prefetch(object + s->offset); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) @@ -5667,7 +5666,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); out: kobject_put(&s->kobj); } @@ -5752,6 +5750,12 @@ static void sysfs_slab_remove(struct kmem_cache *s) schedule_work(&s->kobj_remove_work); } +void sysfs_slab_unlink(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_del(&s->kobj); +} + void sysfs_slab_release(struct kmem_cache *s) { if (slab_state >= FULL) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bd0276d5f66b..8301293331a2 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long goal) { return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + BOOTMEM_ALLOC_ACCESSIBLE, node); } -static void *vmemmap_buf; -static void *vmemmap_buf_end; - void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. 
*/ @@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) /* need to make sure size is all the same during early stage */ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) { - void *ptr; - - if (!vmemmap_buf) - return vmemmap_alloc_block(size, node); - - /* take the from buf */ - ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); - if (ptr + size > vmemmap_buf_end) - return vmemmap_alloc_block(size, node); - - vmemmap_buf = ptr + size; + void *ptr = sparse_buffer_alloc(size); + if (!ptr) + ptr = vmemmap_alloc_block(size, node); return ptr; } @@ -272,45 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, return map; } - -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - void *vmemmap_buf_start; - - size = ALIGN(size, PMD_SIZE); - vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, - PMD_SIZE, __pa(MAX_DMA_ADDRESS)); - - if (vmemmap_buf_start) { - vmemmap_buf = vmemmap_buf_start; - vmemmap_buf_end = vmemmap_buf_start + size * map_count; - } - - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - struct mem_section *ms; - - if (!present_section_nr(pnum)) - continue; - - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[pnum]) - continue; - ms = __nr_to_section(pnum); - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - ms->section_mem_map = 0; - } - - if (vmemmap_buf_start) { - /* need to free left buf */ - memblock_free_early(__pa(vmemmap_buf), - vmemmap_buf_end - vmemmap_buf); - vmemmap_buf = NULL; - vmemmap_buf_end = NULL; - } -} diff --git a/mm/sparse.c b/mm/sparse.c index f13f2723950a..10b07eea9a6e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -200,6 +200,11 @@ static inline int next_present_section_nr(int section_nr) (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) +static inline unsigned long first_present_section_nr(void) +{ + return next_present_section_nr(-1); +} + /* Record a memory area against a node. 
*/ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -257,19 +262,14 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); } -static int __meminit sparse_init_one_section(struct mem_section *ms, +static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, unsigned long *pageblock_bitmap) { - if (!present_section(ms)) - return -EINVAL; - ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | SECTION_HAS_MEM_MAP; ms->pageblock_flags = pageblock_bitmap; - - return 1; } unsigned long usemap_size(void) @@ -370,160 +370,121 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long usemap_count, int nodeid) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static unsigned long __init section_map_size(void) { - void *usemap; - unsigned long pnum; - unsigned long **usemap_map = (unsigned long **)data; - int size = usemap_size(); - - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - size * usemap_count); - if (!usemap) { - pr_warn("%s: allocation failed\n", __func__); - return; - } + return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); +} - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - usemap_map[pnum] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[pnum]); - } +#else +static unsigned long __init section_map_size(void) +{ + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } -#ifndef CONFIG_SPARSEMEM_VMEMMAP struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, struct vmem_altmap *altmap) { - struct page *map; - unsigned long size; + unsigned long size = section_map_size(); + struct page *map = sparse_buffer_alloc(size); + + if (map) + return map; - size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); map = memblock_virt_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } -void __init sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) -{ - void *map; - unsigned long pnum; - unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; - - size = PAGE_ALIGN(size); - map = memblock_virt_alloc_try_nid_raw(size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nodeid); - if (map) { - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - if (!present_section_nr(pnum)) - continue; - map_map[pnum] = map; - map += size; - } - return; - } +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ - /* fallback */ - for (pnum = pnum_begin; pnum < pnum_end; pnum++) { - struct mem_section *ms; +static void *sparsemap_buf __meminitdata; +static void *sparsemap_buf_end __meminitdata; - if (!present_section_nr(pnum)) - continue; - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL); - if (map_map[pnum]) - continue; - ms = __nr_to_section(pnum); - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - ms->section_mem_map = 0; - } +static void __init sparse_buffer_init(unsigned long size, int nid) +{ + WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
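sparse_buffer_init() and sparse_buffer_alloc(), in the hunk continuing below, replace the old per-node vmemmap_buf with a single early allocation carved up by a bump allocator: the cursor is aligned to the request size and advanced, and NULL is returned once the buffer is exhausted. A userspace sketch of that allocator, with PTR_ALIGN open-coded and sizes assumed to be powers of two as in the memmap case:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static char *buf, *buf_end;

static void buffer_init(size_t size)
{
	buf = malloc(size);
	buf_end = buf + size;
}

/* Align the cursor up to 'size' and hand out that many bytes, or NULL. */
static void *buffer_alloc(size_t size)
{
	char *ptr = (char *)(((uintptr_t)buf + size - 1) & ~((uintptr_t)size - 1));

	if (!buf || ptr + size > buf_end)
		return NULL;
	buf = ptr + size;
	return ptr;
}

int main(void)
{
	buffer_init(4096);
	void *a = buffer_alloc(256);
	void *b = buffer_alloc(256);

	printf("%p %p %d\n", a, b, buffer_alloc(8192) == NULL);	/* third request fails: 1 */
	return 0;
}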
*/ + sparsemap_buf = + memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + sparsemap_buf_end = sparsemap_buf + size; } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(void *data, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, int nodeid) +static void __init sparse_buffer_fini(void) { - struct page **map_map = (struct page **)data; - sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, - map_count, nodeid); + unsigned long size = sparsemap_buf_end - sparsemap_buf; + + if (sparsemap_buf && size > 0) + memblock_free_early(__pa(sparsemap_buf), size); + sparsemap_buf = NULL; } -#else -static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) -{ - struct page *map; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - map = sparse_mem_map_populate(pnum, nid, NULL); - if (map) - return map; +void * __meminit sparse_buffer_alloc(unsigned long size) +{ + void *ptr = NULL; - pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", - __func__); - ms->section_mem_map = 0; - return NULL; + if (sparsemap_buf) { + ptr = PTR_ALIGN(sparsemap_buf, size); + if (ptr + size > sparsemap_buf_end) + ptr = NULL; + else + sparsemap_buf = ptr + size; + } + return ptr; } -#endif void __weak __meminit vmemmap_populate_print_last(void) { } -/** - * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap - * @map: usemap_map for pageblock flags or mmap_map for vmemmap +/* + * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) + * And number of present sections in this node is map_count. */ -static void __init alloc_usemap_and_memmap(void (*alloc_func) - (void *, unsigned long, unsigned long, - unsigned long, int), void *data) +static void __init sparse_init_nid(int nid, unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count) { - unsigned long pnum; - unsigned long map_count; - int nodeid_begin = 0; - unsigned long pnum_begin = 0; - - for_each_present_section_nr(0, pnum) { - struct mem_section *ms; + unsigned long pnum, usemap_longs, *usemap; + struct page *map; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; + usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), + usemap_size() * + map_count); + if (!usemap) { + pr_err("%s: node[%d] usemap allocation failed", __func__, nid); + goto failed; + } + sparse_buffer_init(map_count * section_map_size(), nid); + for_each_present_section_nr(pnum_begin, pnum) { + if (pnum >= pnum_end) + break; + + map = sparse_mem_map_populate(pnum, nid, NULL); + if (!map) { + pr_err("%s: node[%d] memory map backing failed. 
Some memory will not be available.", + __func__, nid); + pnum_begin = pnum; + goto failed; + } + check_usemap_section_nr(nid, usemap); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); + usemap += usemap_longs; } - map_count = 1; - for_each_present_section_nr(pnum_begin + 1, pnum) { + sparse_buffer_fini(); + return; +failed: + /* We failed to allocate, mark all the following pnums as not present */ + for_each_present_section_nr(pnum_begin, pnum) { struct mem_section *ms; - int nodeid; + if (pnum >= pnum_end) + break; ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - map_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - alloc_func(data, pnum_begin, pnum, - map_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - map_count = 1; + ms->section_mem_map = 0; } - /* ok, last chunk */ - alloc_func(data, pnum_begin, __highest_present_section_nr+1, - map_count, nodeid_begin); } /* @@ -532,72 +493,29 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) */ void __init sparse_init(void) { - unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - int size2; - struct page **map_map; -#endif - - /* see include/linux/mmzone.h 'struct mem_section' definition */ - BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + unsigned long pnum_begin = first_present_section_nr(); + int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); + unsigned long pnum_end, map_count = 1; /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ set_pageblock_order(); - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
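The rewritten sparse_init() in the hunk that follows walks the present sections once, counts how many consecutive sections share a node id, and hands each run to sparse_init_nid(). The same grouping pattern over a plain array, as a standalone sketch with made-up node ids:

#include <stdio.h>

/* One entry per present section: the node id that backs it. */
static const int section_nid[] = { 0, 0, 0, 1, 1, 2, 2, 2, 2 };
#define NR_SECTIONS (sizeof(section_nid) / sizeof(section_nid[0]))

static void init_nid(int nid, unsigned long begin, unsigned long end,
		     unsigned long count)
{
	printf("node %d: sections [%lu, %lu), %lu map(s)\n", nid, begin, end, count);
}

int main(void)
{
	unsigned long pnum_begin = 0, pnum, count = 1;
	int nid_begin = section_nid[0];

	for (pnum = 1; pnum < NR_SECTIONS; pnum++) {
		if (section_nid[pnum] == nid_begin) {
			count++;
			continue;
		}
		init_nid(nid_begin, pnum_begin, pnum, count);	/* flush the finished run */
		nid_begin = section_nid[pnum];
		pnum_begin = pnum;
		count = 1;
	}
	init_nid(nid_begin, pnum_begin, pnum, count);		/* cover the last node */
	return 0;
}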
- */ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = memblock_virt_alloc(size, 0); - if (!usemap_map) - panic("can not allocate usemap_map\n"); - alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = memblock_virt_alloc(size2, 0); - if (!map_map) - panic("can not allocate map_map\n"); - alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, - (void *)map_map); -#endif - - for_each_present_section_nr(0, pnum) { - usemap = usemap_map[pnum]; - if (!usemap) - continue; + for_each_present_section_nr(pnum_begin + 1, pnum_end) { + int nid = sparse_early_nid(__nr_to_section(pnum_end)); -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - map = map_map[pnum]; -#else - map = sparse_early_mem_map_alloc(pnum); -#endif - if (!map) + if (nid == nid_begin) { + map_count++; continue; - - sparse_init_one_section(__nr_to_section(pnum), pnum, map, - usemap); + } + /* Init node with sections in range [pnum_begin, pnum_end) */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); + nid_begin = nid; + pnum_begin = pnum_end; + map_count = 1; } - + /* cover the last node */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); vmemmap_populate_print_last(); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - memblock_free_early(__pa(map_map), size2); -#endif - memblock_free_early(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -760,6 +678,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; + ret = 0; memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap); if (!memmap) return -ENOMEM; @@ -786,12 +705,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, #endif section_mark_present(ms); - - ret = sparse_init_one_section(ms, section_nr, memmap, usemap); + sparse_init_one_section(ms, section_nr, memmap, usemap); out: pgdat_resize_unlock(pgdat, &flags); - if (ret <= 0) { + if (ret < 0) { kfree(usemap); __kfree_section_memmap(memmap, altmap); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index a791411fed71..008ccb22fee6 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -38,9 +38,9 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; -DEFINE_MUTEX(swap_slots_cache_mutex); +static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ -DEFINE_MUTEX(swap_slots_cache_enable_mutex); +static DEFINE_MUTEX(swap_slots_cache_enable_mutex); static void __drain_swap_slots_cache(unsigned int type); static void deactivate_swap_slots_cache(void); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2cc2972eedaf..8837b22c848d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2909,6 +2909,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) return 0; } + +/* + * Find out how many pages are allowed for a single swap device. There + * are two limiting factors: + * 1) the number of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte, as defined by the different + * architectures. 
+ * + * In order to find the largest possible bit mask, a swap entry with + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again, and finally the swap offset is + * extracted. + * + * This will mask all the bits from the initial ~0UL mask that can't + * be encoded in either the swp_entry_t or the architecture definition + * of a swap pte. + */ +unsigned long generic_max_swapfile_size(void) +{ + return swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; +} + +/* Can be overridden by an architecture for additional checks. */ +__weak unsigned long max_swapfile_size(void) +{ + return generic_max_swapfile_size(); +} + static unsigned long read_swap_header(struct swap_info_struct *p, union swap_header *swap_header, struct inode *inode) @@ -2944,22 +2973,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->cluster_next = 1; p->cluster_nr = 0; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number - * of bits for the swap offset in the swp_entry_t type, and - * 2) the number of bits in the swap pte as defined by the - * different architectures. In order to find the - * largest possible bit mask, a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again, and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + maxpages = max_swapfile_size(); last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); @@ -3731,6 +3745,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask) +{ + struct swap_info_struct *si, *next; + if (!(gfp_mask & __GFP_IO) || !memcg) + return; + + if (!blk_cgroup_congested()) + return; + + /* + * We've already scheduled a throttle, avoid taking the global swap + * lock. + */ + if (current->throttle_queue) + return; + + spin_lock(&swap_avail_lock); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], + avail_lists[node]) { + if (si->bdev) { + blkcg_schedule_throttle(bdev_get_queue(si->bdev), + true); + break; + } + } + spin_unlock(&swap_avail_lock); +} +#endif + static int __init swapfile_init(void) { int nid; diff --git a/mm/usercopy.c b/mm/usercopy.c index e9e9325f7638..852eb4e53f06 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -20,6 +20,8 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <linux/atomic.h> +#include <linux/jump_label.h> #include <asm/sections.h> /* @@ -240,6 +242,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } +static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); + /* * Validates that the given object is: * - not bogus address @@ -248,6 +252,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { + if (static_branch_unlikely(&bypass_usercopy_checks)) + return; + /* Skip all tests if size is zero. 
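generic_max_swapfile_size() above masks ~0UL through the swp_entry -> pte -> swp_entry round trip, so only the offset bits that survive the architecture's pte encoding remain. A toy model of the idea; the 5-bit type / 50-bit offset layout here is purely illustrative and not any real architecture's swap pte format:

#include <stdio.h>

/* Toy swap pte layout: low 5 bits hold the type, the next 50 the offset. */
#define SWP_TYPE_BITS	5
#define SWP_OFFSET_BITS	50

typedef unsigned long long pte_t;

static pte_t swp_entry_to_pte(unsigned type, unsigned long long offset)
{
	offset &= (1ULL << SWP_OFFSET_BITS) - 1;	/* bits that do not fit are lost */
	return ((pte_t)offset << SWP_TYPE_BITS) | type;
}

static unsigned long long pte_to_swp_offset(pte_t pte)
{
	return pte >> SWP_TYPE_BITS;
}

/* Largest page count a single swap device can address in this layout. */
static unsigned long long max_swapfile_size(void)
{
	return pte_to_swp_offset(swp_entry_to_pte(0, ~0ULL)) + 1;
}

int main(void)
{
	printf("max pages: %llu\n", max_swapfile_size());	/* 2^50 */
	return 0;
}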
*/ if (!n) return; @@ -279,3 +286,21 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); + +static bool enable_checks __initdata = true; + +static int __init parse_hardened_usercopy(char *str) +{ + return strtobool(str, &enable_checks); +} + +__setup("hardened_usercopy=", parse_hardened_usercopy); + +static int __init set_hardened_usercopy(void) +{ + if (enable_checks == false) + static_branch_enable(&bypass_usercopy_checks); + return 1; +} + +late_initcall(set_hardened_usercopy); diff --git a/mm/vmacache.c b/mm/vmacache.c index db7596eb6132..ea517bef7dc5 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -6,6 +6,18 @@ #include <linux/sched/task.h> #include <linux/mm.h> #include <linux/vmacache.h> +#include <asm/pgtable.h> + +/* + * Hash based on the pmd of addr if configured with MMU, which provides a good + * hit rate for workloads with spatial locality. Otherwise, use pages. + */ +#ifdef CONFIG_MMU +#define VMACACHE_SHIFT PMD_SHIFT +#else +#define VMACACHE_SHIFT PAGE_SHIFT +#endif +#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) /* * Flush vma caches for threads that share a given mm. @@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm) struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { + int idx = VMACACHE_HASH(addr); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; - if (!vma) - continue; - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; + if (vma) { +#ifdef CONFIG_DEBUG_VM_VMACACHE + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; +#endif + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; @@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, unsigned long start, unsigned long end) { + int idx = VMACACHE_HASH(start); int i; count_vm_vmacache_event(VMACACHE_FIND_CALLS); @@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, return NULL; for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[i]; + struct vm_area_struct *vma = current->vmacache.vmas[idx]; if (vma && vma->vm_start == start && vma->vm_end == end) { count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; } + if (++idx == VMACACHE_SIZE) + idx = 0; } return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cfea25be7754..a728fc492557 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1907,10 +1907,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size diff --git a/mm/vmscan.c b/mm/vmscan.c index 03822f86f288..4375b1e9bd56 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -65,12 +65,6 @@ struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* 
Allocation order */ - int order; - /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. @@ -83,12 +77,6 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* Scan (total_size >> priority) pages at once */ - int priority; - - /* The highest zone to isolate pages for reclaim from */ - enum zone_type reclaim_idx; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; @@ -111,6 +99,18 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate pages for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -169,6 +169,70 @@ unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); +#ifdef CONFIG_MEMCG_KMEM + +/* + * We allow subsystems to populate their shrinker-related + * LRU lists before register_shrinker_prepared() is called + * for the shrinker, since we don't want to impose + * restrictions on their internal registration order. + * In this case shrink_slab_memcg() may find corresponding + * bit is set in the shrinkers map. + * + * This value is used by the function to detect registering + * shrinkers and to skip do_shrink_slab() calls for them. + */ +#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) + +static DEFINE_IDR(shrinker_idr); +static int shrinker_nr_max; + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) { + if (memcg_expand_shrinker_maps(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + + shrinker_nr_max = id + 1; + } + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + down_write(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); + up_write(&shrinker_rwsem); +} +#else /* CONFIG_MEMCG_KMEM */ +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { @@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) return -ENOMEM; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + if (prealloc_memcg_shrinker(shrinker)) + goto free_deferred; + } + return 0; + +free_deferred: + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { + if (!shrinker->nr_deferred) + return; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -326,6 +407,9 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); +#ifdef CONFIG_MEMCG_KMEM + idr_replace(&shrinker_idr, shrinker, 
shrinker->id); +#endif up_write(&shrinker_rwsem); } @@ -347,6 +431,8 @@ void unregister_shrinker(struct shrinker *shrinker) { if (!shrinker->nr_deferred) return; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); @@ -371,9 +457,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, : SHRINK_BATCH; long scanned = 0, next_deferred; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0) - return 0; + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; /* * copy the current shrinker scan count into a local variable @@ -474,6 +563,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } +#ifdef CONFIG_MEMCG_KMEM +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct memcg_shrinker_map *map; + unsigned long freed = 0; + int ret, i; + + if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, + true); + if (unlikely(!map)) + goto unlock; + + for_each_set_bit(i, map->map, shrinker_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (!shrinker) + clear_bit(i, map->map); + continue; + } + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(i, map->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * memcg_set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * <MB> <MB> + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + memcg_set_shrinker_bit(memcg, nid, i); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* CONFIG_MEMCG_KMEM */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG_KMEM */ + /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context @@ -486,10 +653,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * - * @memcg specifies the memory cgroup to target. If it is not NULL, - * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise, only unaware - * shrinkers are called. + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. 
* * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. @@ -502,9 +667,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { struct shrinker *shrinker; unsigned long freed = 0; + int ret; - if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) - return 0; + if (!mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); if (!down_read_trylock(&shrinker_rwsem)) goto out; @@ -516,19 +682,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - /* - * If kernel memory accounting is disabled, we ignore - * SHRINKER_MEMCG_AWARE flag and call all shrinkers - * passing NULL for memcg. - */ - if (memcg_kmem_enabled() && - !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) - continue; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - sc.nid = 0; - - freed += do_shrink_slab(&sc, shrinker, priority); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; /* * Bail out if someone wants to register a new shrinker to * prevent the registration from being stalled for long periods @@ -554,6 +711,7 @@ void drop_slab_node(int nid) struct mem_cgroup *memcg = NULL; freed = 0; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); @@ -2573,9 +2731,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; - if (memcg) - shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->priority); + shrink_slab(sc->gfp_mask, pgdat->node_id, + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2599,10 +2756,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - if (global_reclaim(sc)) - shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->priority); - if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -3064,6 +3217,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, }; /* + * scan_control uses s8 fields for order, priority, and reclaim_idx. + * Confirm they are large enough for max values. + */ + BUILD_BUG_ON(MAX_ORDER > S8_MAX); + BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); + BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); + + /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. diff --git a/mm/vmstat.c b/mm/vmstat.c index 75eda9c2b260..8ba0870ecddd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1796,11 +1796,9 @@ static void vmstat_update(struct work_struct *w) * to occur in the future. Keep on running the * update worker thread.
*/ - preempt_disable(); queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); - preempt_enable(); } } diff --git a/mm/workingset.c b/mm/workingset.c index 40ee02c83978..4516dd790129 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -366,10 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside the IRQ-safe i_pages lock */ - local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); - local_irq_enable(); /* * Approximate a reasonable limit for the radix tree nodes @@ -402,6 +399,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + if (!nodes) + return SHRINK_EMPTY; + if (nodes <= max_nodes) return 0; return nodes - max_nodes; @@ -434,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { - spin_unlock(lru_lock); + spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } @@ -472,26 +472,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, workingset_lookup_update(mapping)); out_invalid: - xa_unlock(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: - local_irq_enable(); cond_resched(); - local_irq_disable(); - spin_lock(lru_lock); + spin_lock_irq(lru_lock); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long ret; - /* list_lru lock nests inside the IRQ-safe i_pages lock */ - local_irq_disable(); - ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); - local_irq_enable(); - return ret; + return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, + NULL); } static struct shrinker workingset_shadow_shrinker = { @@ -528,15 +522,17 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); + ret = prealloc_shrinker(&workingset_shadow_shrinker); if (ret) goto err; - ret = register_shrinker(&workingset_shadow_shrinker); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, + &workingset_shadow_shrinker); if (ret) goto err_list_lru; + register_shrinker_prepared(&workingset_shadow_shrinker); return 0; err_list_lru: - list_lru_destroy(&shadow_nodes); + free_prealloced_shrinker(&workingset_shadow_shrinker); err: return ret; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8d87e973a4f5..9da65552e7ca 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -924,20 +924,7 @@ static void reset_page(struct page *page) page->freelist = NULL; } -/* - * To prevent zspage destroy during migration, zspage freeing should - * hold locks of all pages in the zspage. - */ -void lock_zspage(struct zspage *zspage) -{ - struct page *page = get_first_page(zspage); - - do { - lock_page(page); - } while ((page = get_next_page(page)) != NULL); -} - -int trylock_zspage(struct zspage *zspage) +static int trylock_zspage(struct zspage *zspage) { struct page *cursor, *fail; @@ -1814,6 +1801,19 @@ static enum fullness_group putback_zspage(struct size_class *class, } #ifdef CONFIG_COMPACTION +/* + * To prevent zspage destroy during migration, zspage freeing should + * hold locks of all pages in the zspage. 
+ */ +static void lock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + lock_page(page); + } while ((page = get_next_page(page)) != NULL); +} + static struct dentry *zs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -1905,7 +1905,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, __SetPageMovable(newpage, page_mapping(oldpage)); } -bool zs_page_isolate(struct page *page, isolate_mode_t mode) +static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; struct size_class *class; @@ -1960,7 +1960,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -int zs_page_migrate(struct address_space *mapping, struct page *newpage, +static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { struct zs_pool *pool; @@ -2076,7 +2076,7 @@ unpin_objects: return ret; } -void zs_page_putback(struct page *page) +static void zs_page_putback(struct page *page) { struct zs_pool *pool; struct size_class *class; @@ -2108,7 +2108,7 @@ void zs_page_putback(struct page *page) spin_unlock(&class->lock); } -const struct address_space_operations zsmalloc_aops = { +static const struct address_space_operations zsmalloc_aops = { .isolate_page = zs_page_isolate, .migratepage = zs_page_migrate, .putback_page = zs_page_putback, diff --git a/mm/zswap.c b/mm/zswap.c index 7d34e69507e3..cd91fd9d96b8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = -ENOMEM; goto reject; } + + /* A second zswap_is_full() check after + * zswap_shrink() to make sure it's now + * under the max_pool_percent + */ + if (zswap_is_full()) { + ret = -ENOMEM; + goto reject; + } } /* allocate entry */ |
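
A note on the mm/vmacache.c hunk above: the per-task VMA cache is no longer scanned from slot 0; a lookup starts at a hash of the address (its PMD index when an MMU is configured) and probes the remaining slots with wrap-around, which is what VMACACHE_HASH() and the ++idx loop implement. A minimal userspace sketch of that scheme, assuming a 4-entry cache and a 2 MB shift as stand-ins for VMACACHE_SIZE and PMD_SHIFT (vma_range and cache_find are illustrative names, not kernel identifiers):

#include <stdio.h>

#define CACHE_BITS	2
#define CACHE_SIZE	(1U << CACHE_BITS)
#define CACHE_MASK	(CACHE_SIZE - 1)
#define HASH_SHIFT	21	/* stand-in for PMD_SHIFT */
#define CACHE_HASH(addr)	(((addr) >> HASH_SHIFT) & CACHE_MASK)

struct vma_range {
	unsigned long start, end;	/* covers [start, end) */
};

static struct vma_range *cache[CACHE_SIZE];

/* Start at the hashed slot, then probe the rest with wrap-around. */
static struct vma_range *cache_find(unsigned long addr)
{
	unsigned int idx = CACHE_HASH(addr);

	for (unsigned int i = 0; i < CACHE_SIZE; i++) {
		struct vma_range *vma = cache[idx];

		if (vma && vma->start <= addr && vma->end > addr)
			return vma;	/* hit: addr falls inside the range */
		if (++idx == CACHE_SIZE)
			idx = 0;
	}
	return NULL;
}

int main(void)
{
	static struct vma_range a = { 0x00200000, 0x00400000 };

	cache[CACHE_HASH(a.start)] = &a;
	printf("0x300000 -> %s\n", cache_find(0x00300000) ? "hit" : "miss");
	printf("0x500000 -> %s\n", cache_find(0x00500000) ? "hit" : "miss");
	return 0;
}

With spatially local accesses, addresses in the same 2 MB region hash to the same slot, which is the hit-rate argument made in the new comment in vmacache.c.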
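
The new shrink_slab_memcg() path only walks shrinkers whose bit is set in the per-memcg, per-node map, clears a bit when a shrinker reports SHRINK_EMPTY, and invokes the shrinker once more after the clear so an object added concurrently is not lost. A single-threaded toy model of that bookkeeping, with the kernel's memory barriers omitted and group_shrink, demo_shrinker and NR_SHRINKERS invented for the sketch:

#include <stdio.h>

#define NR_SHRINKERS	4
#define SHRINK_EMPTY	(-2L)	/* same sentinel idea as the kernel's */

typedef long (*shrinker_fn)(int id);

static unsigned long bitmap;			/* one bit per shrinker id */
static long objects[NR_SHRINKERS] = { 0, 3, 0, 7 };

static long demo_shrinker(int id)
{
	long n = objects[id];

	objects[id] = 0;
	return n ? n : SHRINK_EMPTY;	/* nothing left -> report empty */
}

static long group_shrink(shrinker_fn fn)
{
	long freed = 0;

	for (int id = 0; id < NR_SHRINKERS; id++) {
		long ret;

		if (!(bitmap & (1UL << id)))
			continue;	/* skip shrinkers known to be empty */

		ret = fn(id);
		if (ret == SHRINK_EMPTY) {
			bitmap &= ~(1UL << id);
			/*
			 * Re-invoke once: an object could have been added
			 * between the empty report and the bit clear. The
			 * kernel pairs this with memory barriers; this toy
			 * is single-threaded and leaves them out.
			 */
			ret = fn(id);
			if (ret == SHRINK_EMPTY)
				ret = 0;
			else
				bitmap |= 1UL << id;	/* still populated */
		}
		freed += ret;
	}
	return freed;
}

int main(void)
{
	bitmap = 0xf;	/* pretend every shrinker has been marked populated */
	printf("freed %ld, bitmap %#lx\n", group_shrink(demo_shrinker), bitmap);
	printf("freed %ld, bitmap %#lx\n", group_shrink(demo_shrinker), bitmap);
	return 0;
}

The second pass frees nothing and ends with an empty bitmap, which is the point of the change: shrinkers known to be empty stop being visited at all.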
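
The struct scan_control reshuffle narrows order, priority and reclaim_idx to s8 and adds BUILD_BUG_ON() checks in try_to_free_pages() so the types can never silently truncate their limits. The same pattern in portable C11, with _Static_assert standing in for BUILD_BUG_ON and the demo struct, field order and limit values chosen only for illustration:

#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_ORDER		11	/* typical MAX_ORDER */
#define DEMO_DEF_PRIORITY	12
#define DEMO_MAX_NR_ZONES	5

struct demo_scan_control {
	unsigned long nr_to_reclaim;
	unsigned int may_writepage:1;
	int8_t order;		/* narrowed to a byte each, kept adjacent */
	int8_t priority;
	int8_t reclaim_idx;
	unsigned int gfp_mask;	/* gfp_t stand-in */
	unsigned long nr_scanned;
};

/* Compile-time guarantees that the limits still fit in a signed byte. */
_Static_assert(DEMO_MAX_ORDER <= INT8_MAX, "order must fit in s8");
_Static_assert(DEMO_DEF_PRIORITY <= INT8_MAX, "priority must fit in s8");
_Static_assert(DEMO_MAX_NR_ZONES <= INT8_MAX, "reclaim_idx must fit in s8");

int main(void)
{
	printf("sizeof(struct demo_scan_control) = %zu bytes\n",
	       sizeof(struct demo_scan_control));
	return 0;
}

Keeping the three byte-sized fields next to the bitfields and gfp_mask is what lets the structure shrink on common ABIs; the asserts only document and enforce the assumption.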
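
Finally, the mm/zswap.c hunk adds a second zswap_is_full() check after zswap_shrink(): if writeback could not bring the pool back under max_pool_percent, the store is rejected rather than pushed into an already-full pool. A toy model of that control flow, with pool_is_full(), pool_shrink() and pool_store() as hypothetical stand-ins for the zswap functions:

#include <stdbool.h>
#include <stdio.h>

static unsigned long pool_used;
static const unsigned long pool_limit = 100;

static bool pool_is_full(void)
{
	return pool_used >= pool_limit;
}

/* Pretend writeback only manages to evict a small amount. */
static void pool_shrink(void)
{
	if (pool_used >= 5)
		pool_used -= 5;
}

static int pool_store(unsigned long size)
{
	if (pool_is_full()) {
		pool_shrink();
		/* Re-check: shrinking may not have freed enough room. */
		if (pool_is_full())
			return -1;	/* reject, as the new check does */
	}
	pool_used += size;
	return 0;
}

int main(void)
{
	pool_used = 120;	/* well over the limit: shrinking won't be enough */
	printf("store -> %d (used=%lu)\n", pool_store(10), pool_used);
	pool_used = 60;		/* plenty of room: store succeeds */
	printf("store -> %d (used=%lu)\n", pool_store(10), pool_used);
	return 0;
}

Without the second check, the first store above would be accepted and push the pool even further past its limit, which is exactly the behaviour the patch removes.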