Diffstat (limited to 'mm')
53 files changed, 1331 insertions, 1073 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index d75a0107f61f..e72e61c1d62e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -383,7 +383,7 @@ config NOMMU_INITIAL_TRIM_EXCESS This option specifies the initial value of this option. The default of 1 says that all excess pages should be trimmed. - See Documentation/mm/nommu-mmap.rst for more information. + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" @@ -832,10 +832,10 @@ config PERCPU_STATS be used to help understand percpu memory usage. config GUP_BENCHMARK - bool "Enable infrastructure for get_user_pages_fast() benchmarking" + bool "Enable infrastructure for get_user_pages() and related calls benchmarking" help Provides /sys/kernel/debug/gup_benchmark that helps with testing - performance of get_user_pages_fast(). + performance of get_user_pages() and related calls. See tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Makefile b/mm/Makefile index d5649f1c12c0..d73aed0fc99c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -94,7 +94,6 @@ obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o -obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8e8b00627bb2..408d5051d05b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,9 +14,7 @@ #include <linux/device.h> #include <trace/events/writeback.h> -struct backing_dev_info noop_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; +struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; @@ -204,10 +202,9 @@ static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *page) { - struct backing_dev_info *bdi = dev_get_drvdata(dev); - - return snprintf(page, PAGE_SIZE-1, "%d\n", - bdi_cap_stable_pages_required(bdi) ? 1 : 0); + dev_warn_once(dev, + "the stable_pages_required attribute has been removed. 
Use the stable_writes queue attribute instead.\n"); + return snprintf(page, PAGE_SIZE-1, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); @@ -746,6 +743,9 @@ struct backing_dev_info *bdi_alloc(int node_id) kfree(bdi); return NULL; } + bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = VM_READAHEAD_PAGES; return bdi; } EXPORT_SYMBOL(bdi_alloc); diff --git a/mm/compaction.c b/mm/compaction.c index 176dcded298e..6c63844fc061 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -180,11 +180,10 @@ bool compaction_deferred(struct zone *zone, int order) return false; /* Avoid possible overflow */ - if (++zone->compact_considered > defer_limit) + if (++zone->compact_considered >= defer_limit) { zone->compact_considered = defer_limit; - - if (zone->compact_considered >= defer_limit) return false; + } trace_mm_compaction_deferred(zone, order); diff --git a/mm/debug.c b/mm/debug.c index ca8d1cacdecc..ccca576b2899 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason) if (hpage_pincount_available(page)) { pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - head_mapcount(head), - head_pincount(head)); + head_compound_mapcount(head), + head_compound_pincount(head)); } else { pr_warn("head:%p order:%u compound_mapcount:%d\n", head, compound_order(head), - head_mapcount(head)); + head_compound_mapcount(head)); } } if (PageKsm(page)) @@ -120,6 +120,7 @@ void __dump_page(struct page *page, const char *reason) struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; + unsigned long ino; /* * mapping can be invalid pointer and we don't want to crash @@ -136,21 +137,22 @@ void __dump_page(struct page *page, const char *reason) goto out_mapping; } - if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) { + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps with invalid host inode %px\n", a_ops, host); goto out_mapping; } if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino); + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); goto out_mapping; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps with invalid dentry %px\n", a_ops, - dentry_ptr); + pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", + a_ops, ino, dentry_ptr); } else { /* * if dentry is corrupted, the %pd handler may still @@ -158,7 +160,7 @@ void __dump_page(struct page *page, const char *reason) * corrupted struct page */ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, host->i_ino, &dentry); + a_ops, ino, &dentry); } } out_mapping: diff --git a/mm/dmapool.c b/mm/dmapool.c index f9fb9bbd733e..a97c97232337 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -266,6 +266,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) */ void dma_pool_destroy(struct dma_pool *pool) { + struct dma_page *page, *tmp; bool empty = false; if (unlikely(!pool)) @@ -281,17 +282,13 @@ void dma_pool_destroy(struct dma_pool *pool) device_remove_file(pool->dev, &dev_attr_pools); mutex_unlock(&pools_reg_lock); - while (!list_empty(&pool->page_list)) { - struct dma_page *page; - page = list_entry(pool->page_list.next, - struct dma_page, page_list); + list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { if (is_page_busy(page)) { if 
(pool->dev) - dev_err(pool->dev, - "dma_pool_destroy %s, %p busy\n", + dev_err(pool->dev, "%s %s, %p busy\n", __func__, pool->name, page->vaddr); else - pr_err("dma_pool_destroy %s, %p busy\n", + pr_err("%s %s, %p busy\n", __func__, pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); @@ -355,12 +352,11 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (data[i] == POOL_POISON_FREED) continue; if (pool->dev) - dev_err(pool->dev, - "dma_pool_alloc %s, %p (corrupted)\n", - pool->name, retval); + dev_err(pool->dev, "%s %s, %p (corrupted)\n", + __func__, pool->name, retval); else - pr_err("dma_pool_alloc %s, %p (corrupted)\n", - pool->name, retval); + pr_err("%s %s, %p (corrupted)\n", + __func__, pool->name, retval); /* * Dump the first 4 bytes even if they are not @@ -416,12 +412,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) if (!page) { spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, - "dma_pool_free %s, %p/%lx (bad dma)\n", - pool->name, vaddr, (unsigned long)dma); + dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); else - pr_err("dma_pool_free %s, %p/%lx (bad dma)\n", - pool->name, vaddr, (unsigned long)dma); + pr_err("%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); return; } @@ -432,12 +427,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) if ((dma - page->dma) != offset) { spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, - "dma_pool_free %s, %p (bad vaddr)/%pad\n", - pool->name, vaddr, &dma); + dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); else - pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n", - pool->name, vaddr, &dma); + pr_err("%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); return; } { @@ -449,11 +443,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n", - pool->name, &dma); + dev_err(pool->dev, "%s %s, dma %pad already free\n", + __func__, pool->name, &dma); else - pr_err("dma_pool_free %s, dma %pad already free\n", - pool->name, &dma); + pr_err("%s %s, dma %pad already free\n", + __func__, pool->name, &dma); return; } } diff --git a/mm/fadvise.c b/mm/fadvise.c index 0e66f2aaeea3..d6baa4f451c5 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -141,7 +141,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) } if (end_index >= start_index) { - unsigned long count; + unsigned long nr_pagevec = 0; /* * It's common to FADV_DONTNEED right after @@ -154,8 +154,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ lru_add_drain(); - count = invalidate_mapping_pages(mapping, - start_index, end_index); + invalidate_mapping_pagevec(mapping, + start_index, end_index, + &nr_pagevec); /* * If fewer pages were invalidated than expected then @@ -163,7 +164,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) * a per-cpu pagevec for a remote CPU. Drain all * pagevecs and try again. 
*/ - if (count < (end_index - start_index + 1)) { + if (nr_pagevec) { lru_add_drain_all(); invalidate_mapping_pages(mapping, start_index, end_index); diff --git a/mm/filemap.c b/mm/filemap.c index 5202e38ab79e..38546dca58fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping) || + if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; @@ -1645,19 +1645,19 @@ EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page cache index + * @index: The page cache index. * * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned with an increased refcount. + * page cache page, the head page is returned with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t index) { - XA_STATE(xas, &mapping->i_pages, offset); + XA_STATE(xas, &mapping->i_pages, index); struct page *page; rcu_read_lock(); @@ -1685,7 +1685,6 @@ repeat: put_page(page); goto repeat; } - page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1693,40 +1692,37 @@ out: } /** - * find_lock_entry - locate, pin and lock a page cache entry - * @mapping: the address_space to search - * @offset: the page cache index + * find_lock_entry - Locate and lock a page cache entry. + * @mapping: The address_space to search. + * @index: The page cache index. * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * Looks up the page at @mapping & @index. If there is a page in the + * cache, the head page is returned locked and with an increased refcount. * * If the slot holds a shadow entry of a previously evicted page, or a * swap entry from shmem/tmpfs, it is returned. * - * find_lock_entry() may sleep. - * - * Return: the found page or shadow entry, %NULL if nothing is found. + * Context: May sleep. + * Return: The head page or shadow entry, %NULL if nothing is found. */ -struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index) { struct page *page; repeat: - page = find_get_entry(mapping, offset); + page = find_get_entry(mapping, index); if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ - if (unlikely(page_mapping(page) != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } return page; } -EXPORT_SYMBOL(find_lock_entry); /** * pagecache_get_page - Find and get a reference to a page. @@ -1741,6 +1737,8 @@ EXPORT_SYMBOL(find_lock_entry); * * * %FGP_ACCESSED - The page will be marked accessed. * * %FGP_LOCK - The page is returned locked. + * * %FGP_HEAD - If the page is present and a THP, return the head page + * rather than the exact page specified by the index. 
* * %FGP_CREAT - If no page is present then a new page is allocated using * @gfp_mask and added to the page cache and the VM's LRU list. * The page is returned locked and with an increased refcount. @@ -1781,12 +1779,12 @@ repeat: } /* Has the page been truncated? */ - if (unlikely(compound_head(page)->mapping != mapping)) { + if (unlikely(page->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; } - VM_BUG_ON_PAGE(page->index != index, page); + VM_BUG_ON_PAGE(!thp_contains(page, index), page); } if (fgp_flags & FGP_ACCESSED) @@ -1796,11 +1794,13 @@ repeat: if (page_is_idle(page)) clear_page_idle(page); } + if (!(fgp_flags & FGP_HEAD)) + page = find_subpage(page, index); no_page: if (!page && (fgp_flags & FGP_CREAT)) { int err; - if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp_mask |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp_mask &= ~__GFP_FS; @@ -2365,7 +2365,11 @@ readpage: } if (!PageUptodate(page)) { - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); + if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { @@ -2789,42 +2793,42 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *page; + struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); rcu_read_lock(); - xas_for_each(&xas, page, end_pgoff) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, head, end_pgoff) { + if (xas_retry(&xas, head)) continue; - if (xa_is_value(page)) + if (xa_is_value(head)) goto next; /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(page)) + if (PageLocked(head)) goto next; - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto next; /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(head != xas_reload(&xas))) goto skip; - page = find_subpage(page, xas.xa_index); + page = find_subpage(head, xas.xa_index); - if (!PageUptodate(page) || + if (!PageUptodate(head) || PageReadahead(page) || PageHWPoison(page)) goto skip; - if (!trylock_page(page)) + if (!trylock_page(head)) goto skip; - if (page->mapping != mapping || !PageUptodate(page)) + if (head->mapping != mapping || !PageUptodate(head)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= max_idx) + if (xas.xa_index >= max_idx) goto unlock; if (mmap_miss > 0) @@ -2836,12 +2840,12 @@ void filemap_map_pages(struct vm_fault *vmf, last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, page)) goto unlock; - unlock_page(page); + unlock_page(head); goto next; unlock: - unlock_page(page); + unlock_page(head); skip: - put_page(page); + put_page(head); next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) @@ -329,6 +329,13 @@ void unpin_user_pages(struct page **pages, unsigned long npages) unsigned long index; /* + * If this WARN_ON() fires, then the system *might* be leaking pages (by + * leaving them pinned), but probably not. More likely, gup/pup returned + * a hard -ERRNO error to the caller, who erroneously passed it here. 
+ */ + if (WARN_ON(IS_ERR_VALUE(npages))) + return; + /* * TODO: this can be optimized for huge pages: if a series of pages is * physically contiguous and part of the same compound page, then a * single operation to the head page should suffice. @@ -1255,6 +1262,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } + if (flags & FOLL_PIN) + atomic_set(&mm->has_pinned, 1); + /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has @@ -1744,6 +1754,25 @@ static __always_inline long __gup_longterm_locked(struct mm_struct *mm, } #endif /* CONFIG_FS_DAX || CONFIG_CMA */ +static bool is_valid_gup_flags(unsigned int gup_flags) +{ + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return false; + /* + * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying + * that is, FOLL_LONGTERM is a specific case, more restrictive case of + * FOLL_PIN. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return false; + + return true; +} + #ifdef CONFIG_MMU static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -1839,11 +1868,7 @@ long get_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that with an assertion: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (!is_valid_gup_flags(gup_flags)) return -EINVAL; return __get_user_pages_remote(mm, start, nr_pages, gup_flags, @@ -1889,11 +1914,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that with an assertion: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (!is_valid_gup_flags(gup_flags)) return -EINVAL; return __gup_longterm_locked(current->mm, start, nr_pages, @@ -2485,13 +2506,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 1; } -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, +static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; - pmdp = pmd_offset(&pud, addr); + pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = READ_ONCE(*pmdp); @@ -2528,13 +2549,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } -static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&p4d, addr); + pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); @@ -2549,20 +2570,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) + } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, 
nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } -static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; - p4dp = p4d_offset(&pgd, addr); + p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); @@ -2574,7 +2595,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) + } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -2602,7 +2623,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) + } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -2660,6 +2681,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, FOLL_FAST_ONLY))) return -EINVAL; + if (gup_flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); @@ -2780,11 +2804,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + if (!is_valid_gup_flags(gup_flags)) return -EINVAL; /* diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index be690fa66a46..464cae1fa3ea 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -6,10 +6,10 @@ #include <linux/debugfs.h> #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) -#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark) -#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark) -#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark) -#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark) +#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark) +#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark) +#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark) +#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark) struct gup_benchmark { __u64 get_delta_usec; @@ -28,7 +28,6 @@ static void put_back_pages(unsigned int cmd, struct page **pages, switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_LONGTERM_BENCHMARK: case GUP_BENCHMARK: for (i = 0; i < nr_pages; i++) put_page(pages[i]); @@ -36,6 +35,7 @@ static void put_back_pages(unsigned int cmd, struct page **pages, case PIN_FAST_BENCHMARK: case PIN_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: unpin_user_pages(pages, nr_pages); break; } @@ -50,6 +50,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, switch (cmd) { case PIN_FAST_BENCHMARK: case PIN_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: for (i = 0; i < nr_pages; i++) { page = pages[i]; if (WARN(!page_maybe_dma_pinned(page), @@ -101,11 +102,6 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = get_user_pages_fast(addr, nr, gup->flags, pages + i); break; - case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages(addr, nr, - 
gup->flags | FOLL_LONGTERM, - pages + i, NULL); - break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags, pages + i, NULL); @@ -118,6 +114,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = pin_user_pages(addr, nr, gup->flags, pages + i, NULL); break; + case PIN_LONGTERM_BENCHMARK: + nr = pin_user_pages(addr, nr, + gup->flags | FOLL_LONGTERM, + pages + i, NULL); + break; default: kvfree(pages); ret = -EINVAL; @@ -162,10 +163,10 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_LONGTERM_BENCHMARK: case GUP_BENCHMARK: case PIN_FAST_BENCHMARK: case PIN_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: break; default: return -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index faadc449cca5..65c289c13b58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1074,6 +1074,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + + /* + * If this page is a potentially pinned page, split and retry the fault + * with smaller page size. Normally this should not happen because the + * userspace should use MADV_DONTFORK upon pinned regions. This is a + * best effort that the pinned pages won't be replaced by another + * random page during the coming copy-on-write. + */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(src_page))) { + pte_free(dst_mm, pgtable); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1177,6 +1195,16 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* No huge zero pud yet */ } + /* Please refer to comments in copy_huge_pmd() */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(pud_page(pud)))) { + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pud(vma, src_pud, addr); + return -EAGAIN; + } + pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_mkold(pud_wrprotect(pud)); set_pud_at(dst_mm, addr, dst_pud, pud); @@ -2278,13 +2306,13 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, /* * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't page aligned and it could previously + * vm_next->vm_start isn't hpage aligned and it could previously * contain an hugepage: check if we need to split an huge pmd. 
*/ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; - nstart += adjust_next << PAGE_SHIFT; + nstart += adjust_next; if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) @@ -2342,6 +2370,9 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | +#ifdef CONFIG_64BIT + (1L << PG_arch_2) | +#endif (1L << PG_dirty))); /* ->mapping in first tail page is compound_mapcount */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1cc743d52565..fe76f8fd5a73 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); - VM_BUG_ON(!nrg); list_del(&nrg->link); nrg->from = from; @@ -309,8 +308,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) list_del(&rg->link); kfree(rg); - coalesce_file_region(resv, prg); - return; + rg = prg; } nrg = list_next_entry(rg, link); @@ -320,22 +318,20 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) list_del(&rg->link); kfree(rg); - - coalesce_file_region(resv, nrg); - return; } } -/* Must be called with resv->lock held. Calling this with count_only == true - * will count the number of pages to be added but will not modify the linked - * list. If regions_needed != NULL and count_only == true, then regions_needed - * will indicate the number of file_regions needed in the cache to carry out to - * add the regions for this range. +/* + * Must be called with resv->lock held. + * + * Calling this with regions_needed != NULL will count the number of pages + * to be added but will not modify the linked list. And regions_needed will + * indicate the number of file_regions needed in the cache to carry out to add + * the regions for this range. 
*/ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, - struct hstate *h, long *regions_needed, - bool count_only) + struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; @@ -371,14 +367,14 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, */ if (rg->from > last_accounted_offset) { add += rg->from - last_accounted_offset; - if (!count_only) { + if (!regions_needed) { nrg = get_file_region_entry_from_cache( resv, last_accounted_offset, rg->from); record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); list_add(&nrg->link, rg->link.prev); coalesce_file_region(resv, nrg); - } else if (regions_needed) + } else *regions_needed += 1; } @@ -390,13 +386,13 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, */ if (last_accounted_offset < t) { add += t - last_accounted_offset; - if (!count_only) { + if (!regions_needed) { nrg = get_file_region_entry_from_cache( resv, last_accounted_offset, t); record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); list_add(&nrg->link, rg->link.prev); coalesce_file_region(resv, nrg); - } else if (regions_needed) + } else *regions_needed += 1; } @@ -448,11 +444,8 @@ static int allocate_file_region_entries(struct resv_map *resv, spin_lock(&resv->lock); - list_for_each_entry_safe(rg, trg, &allocated_regions, link) { - list_del(&rg->link); - list_add(&rg->link, &resv->region_cache); - resv->region_cache_count++; - } + list_splice(&allocated_regions, &resv->region_cache); + resv->region_cache_count += to_allocate; } return 0; @@ -492,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long t, retry: /* Count how many regions are actually needed to execute this add. */ - add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed, - true); + add_reservation_in_range(resv, f, t, NULL, NULL, + &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate @@ -521,7 +514,7 @@ retry: goto retry; } - add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false); + add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; @@ -557,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long t, spin_lock(&resv->lock); - /* Count how many hugepages in this range are NOT respresented. */ + /* Count how many hugepages in this range are NOT represented. */ chg = add_reservation_in_range(resv, f, t, NULL, NULL, - out_regions_needed, true); + out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; @@ -1047,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) if (nocma && is_migrate_cma_page(page)) continue; - if (!PageHWPoison(page)) - break; + if (PageHWPoison(page)) + continue; + + list_move(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; } - /* - * if 'non-isolated free hugepage' not found on the list, - * the allocation fails. 
- */ - if (&h->hugepage_freelists[nid] == &page->lru) - return NULL; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - return page; + return NULL; } static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, @@ -1511,9 +1500,9 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); + spin_lock(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); @@ -2423,7 +2412,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h->resv_huge_pages--; } spin_lock(&hugetlb_lock); - list_move(&page->lru, &h->hugepage_activelist); + list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); @@ -3582,18 +3571,20 @@ void hugetlb_report_meminfo(struct seq_file *m) seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); } -int hugetlb_report_node_meminfo(int nid, char *buf) +int hugetlb_report_node_meminfo(char *buf, int len, int nid) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) return 0; - return sprintf(buf, - "Node %d HugePages_Total: %5u\n" - "Node %d HugePages_Free: %5u\n" - "Node %d HugePages_Surp: %5u\n", - nid, h->nr_huge_pages_node[nid], - nid, h->free_huge_pages_node[nid], - nid, h->surplus_huge_pages_node[nid]); + + return sysfs_emit_at(buf, len, + "Node %d HugePages_Total: %5u\n" + "Node %d HugePages_Free: %5u\n" + "Node %d HugePages_Surp: %5u\n", + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); } void hugetlb_show_meminfo(void) @@ -3799,23 +3790,23 @@ bool is_hugetlb_entry_migration(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) + if (is_migration_entry(swp)) return true; else return false; } -static int is_hugetlb_entry_hwpoisoned(pte_t pte) +static bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) - return 1; + if (is_hwpoison_entry(swp)) + return true; else - return 0; + return false; } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -5348,10 +5339,16 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. * - * This routine must be called with i_mmap_rwsem held in at least read mode. - * For hugetlbfs, this prevents removal of any page table entries associated - * with the address space. This is important as we are setting up sharing - * based on existing page table entries (mappings). + * This routine must be called with i_mmap_rwsem held in at least read mode if + * sharing is possible. For hugetlbfs, this prevents removal of any page + * table entries associated with the address space. This is important as we + * are setting up sharing based on existing page table entries (mappings). + * + * NOTE: This routine is only called from huge_pte_alloc. 
Some callers of + * huge_pte_alloc know that sharing is not possible and do not take + * i_mmap_rwsem as a performance optimization. This is handled by the + * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is + * only required for subsequent processing. */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -5368,6 +5365,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; diff --git a/mm/internal.h b/mm/internal.h index 10c677655912..a801a4d51f26 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -65,6 +65,9 @@ static inline void ra_submit(struct file_ra_state *ra, ra->start, ra->size, ra->async_size); } +struct page *find_get_entry(struct address_space *mapping, pgoff_t index); +struct page *find_lock_entry(struct address_space *mapping, pgoff_t index); + /** * page_evictable - test whether a page is evictable * @page: the page to test diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 4f49fa6cd1aa..00a53f1355ae 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -33,6 +33,8 @@ #include <asm/sections.h> +#include <kunit/test.h> + #include "kasan.h" #include "../slab.h" @@ -93,7 +95,7 @@ static void end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn) { + if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) { /* * This thread may hit another WARN() in the panic path. * Resetting this prevents additional WARN() from panicking the @@ -464,12 +466,37 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } +#if IS_ENABLED(CONFIG_KUNIT) +static void kasan_update_kunit_status(struct kunit *cur_test) +{ + struct kunit_resource *resource; + struct kunit_kasan_expectation *kasan_data; + + resource = kunit_find_named_resource(cur_test, "kasan_data"); + + if (!resource) { + kunit_set_failure(cur_test); + return; + } + + kasan_data = (struct kunit_kasan_expectation *)resource->data; + kasan_data->report_found = true; + kunit_put_resource(resource); +} +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; u8 tag = get_tag(object); object = reset_tag(object); + +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); print_tags(tag, object); @@ -488,6 +515,11 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, void *untagged_addr; unsigned long flags; +#if IS_ENABLED(CONFIG_KUNIT) + if (current->kunit_test) + kasan_update_kunit_status(current->kunit_test); +#endif /* IS_ENABLED(CONFIG_KUNIT) */ + disable_trace_on_warning(); tagged_addr = (void *)addr; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cfa0dba5fd3b..58b0d9c502a1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -56,6 +56,9 @@ enum scan_result { #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); + /* default scan 8*512 pte (or 
vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; @@ -914,6 +917,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { + /* + * If the hpage allocated earlier was briefly exposed in page cache + * before collapse_file() failed, it is possible that racing lookups + * have not yet completed, and would then be unpleasantly surprised by + * finding the hpage reused for the same mapping at a different offset. + * Just release the previous allocation if there is any danger of that. + */ + if (*hpage && page_count(*hpage) > 1) { + put_page(*hpage); + *hpage = NULL; + } + if (!*hpage) *hpage = khugepaged_alloc_hugepage(wait); @@ -2292,8 +2307,6 @@ static void set_recommended_min_free_kbytes(void) int start_stop_khugepaged(void) { - static struct task_struct *khugepaged_thread __read_mostly; - static DEFINE_MUTEX(khugepaged_mutex); int err = 0; mutex_lock(&khugepaged_mutex); @@ -2320,3 +2333,11 @@ fail: mutex_unlock(&khugepaged_mutex); return err; } + +void khugepaged_min_free_kbytes_update(void) +{ + mutex_lock(&khugepaged_mutex); + if (khugepaged_enabled() && khugepaged_thread) + set_recommended_min_free_kbytes(); + mutex_unlock(&khugepaged_mutex); +} diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c deleted file mode 100644 index e19279ff6aa3..000000000000 --- a/mm/kmemleak-test.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mm/kmemleak-test.c - * - * Copyright (C) 2008 ARM Limited - * Written by Catalin Marinas <catalin.marinas@arm.com> - */ - -#define pr_fmt(fmt) "kmemleak: " fmt - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/list.h> -#include <linux/percpu.h> -#include <linux/fdtable.h> - -#include <linux/kmemleak.h> - -struct test_node { - long header[25]; - struct list_head list; - long footer[25]; -}; - -static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, kmemleak_test_pointer); - -/* - * Some very simple testing. This function needs to be extended for - * proper testing. - */ -static int __init kmemleak_test_init(void) -{ - struct test_node *elem; - int i; - - pr_info("Kmemleak testing\n"); - - /* make some orphan objects */ - pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); - pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); -#ifndef CONFIG_MODULES - pr_info("kmem_cache_alloc(files_cachep) = %p\n", - kmem_cache_alloc(files_cachep, GFP_KERNEL)); - pr_info("kmem_cache_alloc(files_cachep) = %p\n", - kmem_cache_alloc(files_cachep, GFP_KERNEL)); -#endif - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - - /* - * Add elements to a list. They should only appear as orphan - * after the module is removed. 
- */ - for (i = 0; i < 10; i++) { - elem = kzalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); - if (!elem) - return -ENOMEM; - INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); - } - - for_each_possible_cpu(i) { - per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); - pr_info("kmalloc(129) = %p\n", - per_cpu(kmemleak_test_pointer, i)); - } - - return 0; -} -module_init(kmemleak_test_init); - -static void __exit kmemleak_test_exit(void) -{ - struct test_node *elem, *tmp; - - /* - * Remove the list elements without actually freeing the - * memory. - */ - list_for_each_entry_safe(elem, tmp, &test_list, list) - list_del(&elem->list); -} -module_exit(kmemleak_test_exit); - -MODULE_LICENSE("GPL"); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5e252d91eb14..c0014d3b91c1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1471,15 +1471,15 @@ static void kmemleak_scan(void) if (kmemleak_stack_scan) { struct task_struct *p, *g; - read_lock(&tasklist_lock); - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { void *stack = try_get_task_stack(p); if (stack) { scan_block(stack, stack + THREAD_SIZE, NULL); put_task_stack(p); } - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + } + rcu_read_unlock(); } /* diff --git a/mm/madvise.c b/mm/madvise.c index d4aa5f776543..9b065d412e5f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -224,25 +224,28 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { - pgoff_t index; + XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); + pgoff_t end_index = end / PAGE_SIZE; struct page *page; - swp_entry_t swap; - for (; start < end; start += PAGE_SIZE) { - index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + rcu_read_lock(); + xas_for_each(&xas, page, end_index) { + swp_entry_t swap; - page = find_get_entry(mapping, index); - if (!xa_is_value(page)) { - if (page) - put_page(page); + if (!xa_is_value(page)) continue; - } + xas_pause(&xas); + rcu_read_unlock(); + swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0, false); if (page) put_page(page); + + rcu_read_lock(); } + rcu_read_unlock(); lru_add_drain(); /* Push any new pages onto the LRU now */ } @@ -381,9 +384,9 @@ huge_unlock: return 0; } +regular_page: if (pmd_trans_unstable(pmd)) return 0; -regular_page: #endif tlb_change_page_size(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); diff --git a/mm/memblock.c b/mm/memblock.c index 45f198750be9..165f40a8a254 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -132,7 +132,26 @@ struct memblock_type physmem = { }; #endif -int memblock_debug __initdata_memblock; +/* + * keep a pointer to &memblock.memory in the text section to use it in + * __next_mem_range() and its helpers. + * For architectures that do not keep memblock data after init, this + * pointer will be reset to NULL at memblock_discard() + */ +static __refdata struct memblock_type *memblock_memory = &memblock.memory; + +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) + +#define memblock_dbg(fmt, ...) 
\ + do { \ + if (memblock_debug) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +static int memblock_debug __initdata_memblock; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; @@ -391,6 +410,8 @@ void __init memblock_discard(void) memblock.memory.max); __memblock_free_late(addr, size); } + + memblock_memory = NULL; } #endif @@ -941,42 +962,16 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); } -/** - * __next_reserved_mem_region - next function for for_each_reserved_region() - * @idx: pointer to u64 loop variable - * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL - * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL - * - * Iterate over all reserved memory regions. - */ -void __init_memblock __next_reserved_mem_region(u64 *idx, - phys_addr_t *out_start, - phys_addr_t *out_end) -{ - struct memblock_type *type = &memblock.reserved; - - if (*idx < type->cnt) { - struct memblock_region *r = &type->regions[*idx]; - phys_addr_t base = r->base; - phys_addr_t size = r->size; - - if (out_start) - *out_start = base; - if (out_end) - *out_end = base + size - 1; - - *idx += 1; - return; - } - - /* signal end of iteration */ - *idx = ULLONG_MAX; -} - -static bool should_skip_region(struct memblock_region *m, int nid, int flags) +static bool should_skip_region(struct memblock_type *type, + struct memblock_region *m, + int nid, int flags) { int m_nid = memblock_get_region_node(m); + /* we never skip regions when iterating memblock.reserved or physmem */ + if (type != memblock_memory) + return false; + /* only memory regions are associated with nodes, check it */ if (nid != NUMA_NO_NODE && nid != m_nid) return true; @@ -1041,7 +1036,7 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - if (should_skip_region(m, nid, flags)) + if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { @@ -1145,7 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; int m_nid = memblock_get_region_node(m); - if (should_skip_region(m, nid, flags)) + if (should_skip_region(type_a, m, nid, flags)) continue; if (!type_b) { @@ -1649,23 +1644,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } -phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) -{ - unsigned long pages = 0; - struct memblock_region *r; - unsigned long start_pfn, end_pfn; - - for_each_memblock(memory, r) { - start_pfn = memblock_region_memory_base_pfn(r); - end_pfn = memblock_region_memory_end_pfn(r); - start_pfn = min_t(unsigned long, start_pfn, limit_pfn); - end_pfn = min_t(unsigned long, end_pfn, limit_pfn); - pages += end_pfn - start_pfn; - } - - return PFN_PHYS(pages); -} - /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { @@ -1689,7 +1667,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) * the memory memblock regions, if the @limit exceeds the total size * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1859,7 +1837,7 @@ void __init_memblock 
memblock_trim_memory(phys_addr_t align) phys_addr_t start, end, orig_start, orig_end; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { orig_start = r->base; orig_end = r->base + r->size; start = round_up(orig_start, align); @@ -1915,7 +1893,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } -void __init_memblock __memblock_dump_all(void) +static void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); pr_info(" memory size = %pa reserved size = %pa\n", @@ -1929,6 +1907,12 @@ void __init_memblock __memblock_dump_all(void) #endif } +void __init_memblock memblock_dump_all(void) +{ + if (memblock_debug) + __memblock_dump_all(); +} + void __init memblock_allow_resize(void) { memblock_can_resize = 1; @@ -1981,7 +1965,7 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_reserved_mem_region(i, &start, &end) + for_each_reserved_mem_range(i, &start, &end) reserve_bootmem_region(start, end); /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cfa6cbad21d5..7f74a158cfa8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -197,14 +197,6 @@ static struct move_charge_struct { #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 -enum charge_type { - MEM_CGROUP_CHARGE_TYPE_CACHE = 0, - MEM_CGROUP_CHARGE_TYPE_ANON, - MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ - MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ - NR_CHARGE_TYPE, -}; - /* for encoding cft->private value on file */ enum res_type { _MEM, @@ -1102,9 +1094,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) * invocations for reference counting, or use mem_cgroup_iter_break() * to cancel a hierarchy walk before the round-trip is complete. * - * Reclaimers can specify a node and a priority level in @reclaim to - * divide up the memcgs in the hierarchy among all concurrent - * reclaimers operating on the same node and priority. + * Reclaimers can specify a node in @reclaim to divide up the memcgs + * in the hierarchy among all concurrent reclaimers operating on the + * same node. */ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -1456,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +struct memory_stat { + const char *name; + unsigned int ratio; + unsigned int idx; +}; + +static struct memory_stat memory_stats[] = { + { "anon", PAGE_SIZE, NR_ANON_MAPPED }, + { "file", PAGE_SIZE, NR_FILE_PAGES }, + { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, + { "percpu", 1, MEMCG_PERCPU_B }, + { "sock", PAGE_SIZE, MEMCG_SOCK }, + { "shmem", PAGE_SIZE, NR_SHMEM }, + { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, + { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, + { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * The ratio will be initialized in memory_stats_init(). Because + * on some architectures, the macro of HPAGE_PMD_SIZE is not + * constant(e.g. powerpc). + */ + { "anon_thp", 0, NR_ANON_THPS }, +#endif + { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, + { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, + { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, + { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, + { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, + + /* + * Note: The slab_reclaimable and slab_unreclaimable must be + * together and slab_reclaimable must be in front. 
+ */ + { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, + { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, + + /* The memory events */ + { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, + { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, + { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, + { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, + { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, + { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, + { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, +}; + +static int __init memory_stats_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memory_stats[i].idx == NR_ANON_THPS) + memory_stats[i].ratio = HPAGE_PMD_SIZE; +#endif + VM_BUG_ON(!memory_stats[i].ratio); + VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); + } + + return 0; +} +pure_initcall(memory_stats_init); + static char *memory_stat_format(struct mem_cgroup *memcg) { struct seq_buf s; @@ -1476,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * Current memory state: */ - seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * - PAGE_SIZE); - seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_PAGES) * - PAGE_SIZE); - seq_buf_printf(&s, "kernel_stack %llu\n", - (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) * - 1024); - seq_buf_printf(&s, "slab %llu\n", - (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); - seq_buf_printf(&s, "percpu %llu\n", - (u64)memcg_page_state(memcg, MEMCG_PERCPU_B)); - seq_buf_printf(&s, "sock %llu\n", - (u64)memcg_page_state(memcg, MEMCG_SOCK) * - PAGE_SIZE); - - seq_buf_printf(&s, "shmem %llu\n", - (u64)memcg_page_state(memcg, NR_SHMEM) * - PAGE_SIZE); - seq_buf_printf(&s, "file_mapped %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * - PAGE_SIZE); - seq_buf_printf(&s, "file_dirty %llu\n", - (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * - PAGE_SIZE); - seq_buf_printf(&s, "file_writeback %llu\n", - (u64)memcg_page_state(memcg, NR_WRITEBACK) * - PAGE_SIZE); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + u64 size; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, NR_ANON_THPS) * - HPAGE_PMD_SIZE); -#endif - - for (i = 0; i < NR_LRU_LISTS; i++) - seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), - (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * - PAGE_SIZE); + size = memcg_page_state(memcg, memory_stats[i].idx); + size *= memory_stats[i].ratio; + seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); - seq_buf_printf(&s, "slab_reclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B)); - seq_buf_printf(&s, "slab_unreclaimable %llu\n", - (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)); + if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { + size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + seq_buf_printf(&s, "slab %llu\n", size); + } + } /* Accumulated memory events */ @@ -1529,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_events(memcg, PGFAULT)); seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), memcg_events(memcg, PGMAJFAULT)); - - seq_buf_printf(&s, "workingset_refault_anon %lu\n", - memcg_page_state(memcg, WORKINGSET_REFAULT_ANON)); - seq_buf_printf(&s, "workingset_refault_file %lu\n", - 
memcg_page_state(memcg, WORKINGSET_REFAULT_FILE)); - seq_buf_printf(&s, "workingset_activate_anon %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON)); - seq_buf_printf(&s, "workingset_activate_file %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE)); - seq_buf_printf(&s, "workingset_restore %lu\n", - memcg_page_state(memcg, WORKINGSET_RESTORE_ANON)); - seq_buf_printf(&s, "workingset_restore %lu\n", - memcg_page_state(memcg, WORKINGSET_RESTORE_FILE)); - seq_buf_printf(&s, "workingset_nodereclaim %lu\n", - memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL), memcg_events(memcg, PGREFILL)); seq_buf_printf(&s, "pgscan %lu\n", @@ -1641,17 +1648,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) */ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long max; + unsigned long max = READ_ONCE(memcg->memory.max); - max = READ_ONCE(memcg->memory.max); - if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_max; - unsigned long swap_max; + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); + } else { /* v1 */ + if (mem_cgroup_swappiness(memcg)) { + /* Calculate swap excess capacity from memsw limit */ + unsigned long swap = READ_ONCE(memcg->memsw.max) - max; - memsw_max = memcg->memsw.max; - swap_max = READ_ONCE(memcg->swap.max); - swap_max = min(swap_max, (unsigned long)total_swap_pages); - max = min(max + swap_max, memsw_max); + max += min(swap, (unsigned long)total_swap_pages); + } } return max; } @@ -1817,8 +1826,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) struct mem_cgroup *iter; /* - * When a new child is created while the hierarchy is under oom, - * mem_cgroup_oom_lock() may not be called. Watch for underflow. + * Be careful about under_oom underflows becase a child memcg + * could have been added after mem_cgroup_mark_under_oom. */ spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) @@ -2888,6 +2897,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); /* + * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer + * or a pointer to obj_cgroup vector. In the latter case the lowest + * bit of the pointer is set. + * The page->mem_cgroup pointer can be asynchronously changed + * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed + * from a valid memcg pointer to objcg vector or back. + */ + if (!page->mem_cgroup) + return NULL; + + /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in * the page->obj_cgroups. @@ -4255,17 +4275,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, new->size = size; /* Copy thresholds (if any) to new array */ - if (thresholds->primary) { - memcpy(new->entries, thresholds->primary->entries, (size - 1) * - sizeof(struct mem_cgroup_threshold)); - } + if (thresholds->primary) + memcpy(new->entries, thresholds->primary->entries, + flex_array_size(new, entries, size - 1)); /* Add new threshold */ new->entries[size - 1].eventfd = eventfd; new->entries[size - 1].threshold = threshold; /* Sort thresholds. 
Registering of new threshold isn't time-critical */ - sort(new->entries, size, sizeof(struct mem_cgroup_threshold), + sort(new->entries, size, sizeof(*new->entries), compare_thresholds, NULL); /* Find current threshold */ @@ -5291,13 +5310,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); - page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); /* @@ -5426,7 +5443,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); @@ -5500,7 +5516,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); - if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) + if (!(mc.flags & MOVE_ANON)) return NULL; /* @@ -5519,6 +5535,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return page; } + if (non_swap_entry(ent)) + return NULL; + /* * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. @@ -5539,35 +5558,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { - struct page *page = NULL; - struct address_space *mapping; - pgoff_t pgoff; - if (!vma->vm_file) /* anonymous vma */ return NULL; if (!(mc.flags & MOVE_FILE)) return NULL; - mapping = vma->vm_file->f_mapping; - pgoff = linear_page_index(vma, addr); - /* page is moved even if it's not RSS of this task(page-faulted). */ -#ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. 
*/ - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - *entry = swp; - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif - return page; + return find_get_incore_page(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); } /** @@ -5643,7 +5642,7 @@ static int mem_cgroup_move_account(struct page *page, if (PageDirty(page)) { struct address_space *mapping = page_mapping(page); - if (mapping_cap_account_dirty(mapping)) { + if (mapping_can_writeback(mapping)) { __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages); __mod_lruvec_state(to_vec, NR_FILE_DIRTY, @@ -6393,6 +6392,35 @@ static int memory_stat_show(struct seq_file *m, void *v) return 0; } +#ifdef CONFIG_NUMA +static int memory_numa_stat_show(struct seq_file *m, void *v) +{ + int i; + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + int nid; + + if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) + continue; + + seq_printf(m, "%s", memory_stats[i].name); + for_each_node_state(nid, N_MEMORY) { + u64 size; + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + size = lruvec_page_state(lruvec, memory_stats[i].idx); + size *= memory_stats[i].ratio; + seq_printf(m, " N%d=%llu", nid, size); + } + seq_putc(m, '\n'); + } + + return 0; +} +#endif + static int memory_oom_group_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); @@ -6470,6 +6498,12 @@ static struct cftype memory_files[] = { .name = "stat", .seq_show = memory_stat_show, }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .seq_show = memory_numa_stat_show, + }, +#endif { .name = "oom.group", .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f1aa6433f404..990e3b2e37d5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -484,11 +484,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct address_space *mapping = page->mapping; + pgoff_t pgoff; i_mmap_lock_read(mapping); read_lock(&tasklist_lock); + pgoff = page_to_pgoff(page); for_each_process(tsk) { - pgoff_t pgoff = page_to_pgoff(page); struct task_struct *t = task_early_kill(tsk, force_early); if (!t) @@ -824,7 +825,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked)) #define unevict (1UL << PG_unevictable) #define mlock (1UL << PG_mlocked) -#define writeback (1UL << PG_writeback) #define lru (1UL << PG_lru) #define head (1UL << PG_head) #define slab (1UL << PG_slab) @@ -873,7 +873,6 @@ static struct page_state { #undef sc #undef unevict #undef mlock -#undef writeback #undef lru #undef head #undef slab @@ -1006,7 +1005,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, */ mapping = page_mapping(hpage); if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && - mapping_cap_writeback_dirty(mapping)) { + mapping_can_writeback(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { diff --git a/mm/memory.c b/mm/memory.c index 7e4e8b81a738..2afb01ea1307 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -772,15 +772,105 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static inline void 
-copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) +/* + * Copy a present and normal page if necessary. + * + * NOTE! The usual case is that this doesn't need to do + * anything, and can just return a positive value. That + * will let the caller know that it can just increase + * the page refcount and re-use the pte the traditional + * way. + * + * But _if_ we need to copy it because it needs to be + * pinned in the parent (and the child should get its own + * copy rather than just a reference to the same page), + * we'll do that here and return zero to let the caller + * know we're done. + * + * And if we need a pre-allocated page but don't yet have + * one, return a negative error to let the preallocation + * code know so that it can do so outside the page table + * lock. + */ +static inline int +copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc, pte_t pte, struct page *page) { - unsigned long vm_flags = vma->vm_flags; + struct mm_struct *src_mm = src_vma->vm_mm; + struct page *new_page; + + if (!is_cow_mapping(src_vma->vm_flags)) + return 1; + + /* + * What we want to do is to check whether this page may + * have been pinned by the parent process. If so, + * instead of wrprotect the pte on both sides, we copy + * the page immediately so that we'll always guarantee + * the pinned page won't be randomly replaced in the + * future. + * + * The page pinning checks are just "has this mm ever + * seen pinning", along with the (inexact) check of + * the page count. That might give false positives for + * for pinning, but it will work correctly. + */ + if (likely(!atomic_read(&src_mm->has_pinned))) + return 1; + if (likely(!page_maybe_dma_pinned(page))) + return 1; + + new_page = *prealloc; + if (!new_page) + return -EAGAIN; + + /* + * We have a prealloc page, all good! Take it + * over and copy the page & arm it. + */ + *prealloc = NULL; + copy_user_highpage(new_page, page, addr, src_vma); + __SetPageUptodate(new_page); + page_add_new_anon_rmap(new_page, dst_vma, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, dst_vma); + rss[mm_counter(new_page)]++; + + /* All done, just insert the new page copy in the child */ + pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page + * is required to copy this pte. 
+ */ +static inline int +copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc) +{ + struct mm_struct *src_mm = src_vma->vm_mm; + unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; + page = vm_normal_page(src_vma, addr, pte); + if (page) { + int retval; + + retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, pte, page); + if (retval <= 0) + return retval; + + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } + /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -806,33 +896,53 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - page = vm_normal_page(vma, addr, pte); - if (page) { - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + return 0; +} + +static inline struct page * +page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *new_page; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); + if (!new_page) + return NULL; + + if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { + put_page(new_page); + return NULL; } + cgroup_throttle_swaprate(new_page, GFP_KERNEL); - set_pte_at(dst_mm, addr, dst_pte, pte); + return new_page; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +static int +copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; - int progress = 0; + int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; + struct page *prealloc = NULL; again: + progress = 0; init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) - return -ENOMEM; + if (!dst_pte) { + ret = -ENOMEM; + goto out; + } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -858,14 +968,31 @@ again: if (unlikely(!pte_present(*src_pte))) { entry.val = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + src_vma, addr, rss); if (entry.val) break; progress += 8; continue; } - copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + /* copy_present_pte() will clear `*prealloc' if consumed */ + ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, &prealloc); + /* + * If we need a pre-allocated page for this pte, drop the + * locks, allocate, and try again. + */ + if (unlikely(ret == -EAGAIN)) + break; + if (unlikely(prealloc)) { + /* + * pre-alloc page cannot be reused by next time so as + * to strictly follow mempolicy (e.g., alloc_page_vma() + * will allocate page according to address). This + * could only happen if one pinned pte changed. 
+ */ + put_page(prealloc); + prealloc = NULL; + } progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -877,19 +1004,34 @@ again: cond_resched(); if (entry.val) { - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + ret = -ENOMEM; + goto out; + } + entry.val = 0; + } else if (ret) { + WARN_ON_ONCE(ret != -EAGAIN); + prealloc = page_copy_prealloc(src_mm, src_vma, addr); + if (!prealloc) return -ENOMEM; - progress = 0; + /* We've captured and resolved the error. Reset, try again. */ + ret = 0; } if (addr != end) goto again; - return 0; +out: + if (unlikely(prealloc)) + put_page(prealloc); + return ret; } -static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +static inline int +copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; @@ -902,9 +1044,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, - dst_pmd, src_pmd, addr, vma); + dst_pmd, src_pmd, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -913,17 +1055,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pmd_none_or_clear_bad(src_pmd)) continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, + addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } -static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +static inline int +copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; @@ -936,9 +1081,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; - VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, - dst_pud, src_pud, addr, vma); + dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) @@ -947,17 +1092,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src } if (pud_none_or_clear_bad(src_pud)) continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, + addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +static inline 
int +copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, + unsigned long end) { + struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; @@ -969,20 +1116,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, addr, next)) + if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, + addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; + unsigned long addr = src_vma->vm_start; + unsigned long end = src_vma->vm_end; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; bool is_cow; int ret; @@ -993,19 +1142,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && - !vma->anon_vma) + if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !src_vma->anon_vma) return 0; - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_vm_hugetlb_page(src_vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, src_vma); - if (unlikely(vma->vm_flags & VM_PFNMAP)) { + if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ - ret = track_pfn_copy(vma); + ret = track_pfn_copy(src_vma); if (ret) return ret; } @@ -1016,11 +1165,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. 
*/ - is_cow = is_cow_mapping(vma->vm_flags); + is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, src_mm, addr, end); + 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1031,8 +1180,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { ret = -ENOMEM; break; } @@ -3444,7 +3593,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) * unlock_page(A) * lock_page(B) * lock_page(B) - * pte_alloc_pne + * pte_alloc_one * shrink_page_list * wait_on_page_writeback(A) * SetPageWriteback(B) @@ -3452,7 +3601,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3619,7 +3768,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) /** * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the fucntion allocates page table or use pre-allocated. + * mapping. If needed, the function allocates page table or use pre-allocated. * * @vmf: fault environment * @page: page to map diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b11a269e2356..8e9e2d44cdad 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, #ifdef CONFIG_NUMA int __weak memory_add_physaddr_to_nid(u64 start) { - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", start); return 0; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +int __weak phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); #endif /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ @@ -729,7 +737,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMMAP_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap); set_zone_contiguous(zone); } @@ -1080,7 +1088,8 @@ int __ref add_memory_resource(int nid, struct resource *res) } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); BUG_ON(ret); /* create new memmap entry */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eddbe4e56c73..3fde772ef5ef 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -875,13 +875,12 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } - task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { - task_unlock(current); mpol_put(new); goto out; } + task_lock(current); old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) @@ -1324,9 
+1323,7 @@ static long do_mbind(unsigned long start, unsigned long len, NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); - task_lock(current); err = mpol_set_nodemask(new, nmask, scratch); - task_unlock(current); if (err) mmap_write_unlock(mm); } else @@ -1885,8 +1882,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return the node id preferred by the given mempolicy, or the given id */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, - int nd) +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; diff --git a/mm/mempool.c b/mm/mempool.c index 79bff63ecf27..f473cdddaff0 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -58,11 +58,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size) static void check_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->free == mempool_free_slab || pool->free == mempool_kfree) + if (pool->free == mempool_free_slab || pool->free == mempool_kfree) { __check_element(pool, element, ksize(element)); - - /* Mempools backed by page allocator */ - if (pool->free == mempool_free_pages) { + } else if (pool->free == mempool_free_pages) { + /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_atomic((struct page *)element); @@ -82,11 +81,10 @@ static void __poison_element(void *element, size_t size) static void poison_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) { __poison_element(element, ksize(element)); - - /* Mempools backed by page allocator */ - if (pool->alloc == mempool_alloc_pages) { + } else if (pool->alloc == mempool_alloc_pages) { + /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_atomic((struct page *)element); @@ -107,7 +105,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_poison_kfree(element, _RET_IP_); - if (pool->alloc == mempool_alloc_pages) + else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } @@ -115,7 +113,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_slab(element); - if (pool->alloc == mempool_alloc_pages) + else if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/memremap.c b/mm/memremap.c index 006dace60b1a..198083453182 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -40,12 +40,10 @@ EXPORT_SYMBOL_GPL(memremap_compat_align); #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_managed_enable; static void devmap_managed_enable_put(void) { - if (atomic_dec_and_test(&devmap_managed_enable)) - static_branch_disable(&devmap_managed_key); + static_branch_dec(&devmap_managed_key); } static int devmap_managed_enable_get(struct dev_pagemap *pgmap) @@ -56,8 +54,7 @@ static int devmap_managed_enable_get(struct dev_pagemap *pgmap) return -EINVAL; } - if (atomic_inc_return(&devmap_managed_enable) == 1) - 
static_branch_enable(&devmap_managed_key); + static_branch_inc(&devmap_managed_key); return 0; } #else @@ -70,24 +67,28 @@ static void devmap_managed_enable_put(void) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -static void pgmap_array_delete(struct resource *res) +static void pgmap_array_delete(struct range *range) { - xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end), NULL, GFP_KERNEL); synchronize_rcu(); } -static unsigned long pfn_first(struct dev_pagemap *pgmap) +static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id) { - return PHYS_PFN(pgmap->res.start) + - vmem_altmap_offset(pgmap_altmap(pgmap)); + struct range *range = &pgmap->ranges[range_id]; + unsigned long pfn = PHYS_PFN(range->start); + + if (range_id) + return pfn; + return pfn + vmem_altmap_offset(pgmap_altmap(pgmap)); } -static unsigned long pfn_end(struct dev_pagemap *pgmap) +static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) { - const struct resource *res = &pgmap->res; + const struct range *range = &pgmap->ranges[range_id]; - return (res->start + resource_size(res)) >> PAGE_SHIFT; + return (range->start + range_len(range)) >> PAGE_SHIFT; } static unsigned long pfn_next(unsigned long pfn) @@ -97,8 +98,8 @@ static unsigned long pfn_next(unsigned long pfn) return pfn + 1; } -#define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +#define for_each_device_pfn(pfn, map, i) \ + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) static void dev_pagemap_kill(struct dev_pagemap *pgmap) { @@ -124,39 +125,49 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) pgmap->ref = NULL; } -void memunmap_pages(struct dev_pagemap *pgmap) +static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { - struct resource *res = &pgmap->res; + struct range *range = &pgmap->ranges[range_id]; struct page *first_page; - unsigned long pfn; int nid; - dev_pagemap_kill(pgmap); - for_each_device_pfn(pfn, pgmap) - put_page(pfn_to_page(pfn)); - dev_pagemap_cleanup(pgmap); - /* make sure to access a memmap that was actually initialized */ - first_page = pfn_to_page(pfn_first(pgmap)); + first_page = pfn_to_page(pfn_first(pgmap, range_id)); /* pages are dead and unused, undo the arch mapping */ nid = page_to_nid(first_page); mem_hotplug_begin(); - remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start), - PHYS_PFN(resource_size(res))); + remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start), + PHYS_PFN(range_len(range))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - __remove_pages(PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), NULL); + __remove_pages(PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), NULL); } else { - arch_remove_memory(nid, res->start, resource_size(res), + arch_remove_memory(nid, range->start, range_len(range), pgmap_altmap(pgmap)); - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); + kasan_remove_zero_shadow(__va(range->start), range_len(range)); } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - pgmap_array_delete(res); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + pgmap_array_delete(range); +} + +void memunmap_pages(struct dev_pagemap *pgmap) +{ + unsigned long pfn; + int i; + + dev_pagemap_kill(pgmap); + for (i = 0; i < pgmap->nr_range; i++) + for_each_device_pfn(pfn, pgmap, i) + 
put_page(pfn_to_page(pfn)); + dev_pagemap_cleanup(pgmap); + + for (i = 0; i < pgmap->nr_range; i++) + pageunmap_range(pgmap, i); + WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); devmap_managed_enable_put(); } @@ -175,6 +186,114 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) complete(&pgmap->done); } +static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, + int range_id, int nid) +{ + struct range *range = &pgmap->ranges[range_id]; + struct dev_pagemap *conflict_pgmap; + int error, is_ram; + + if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0, + "altmap not supported for multiple ranges\n")) + return -EINVAL; + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); + if (conflict_pgmap) { + WARN(1, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return -ENOMEM; + } + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); + if (conflict_pgmap) { + WARN(1, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return -ENOMEM; + } + + is_ram = region_intersects(range->start, range_len(range), + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + + if (is_ram != REGION_DISJOINT) { + WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n", + is_ram == REGION_MIXED ? "mixed" : "ram", + range->start, range->end); + return -ENXIO; + } + + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start), + PHYS_PFN(range->end), pgmap, GFP_KERNEL)); + if (error) + return error; + + if (nid < 0) + nid = numa_mem_id(); + + error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0, + range_len(range)); + if (error) + goto err_pfn_remap; + + mem_hotplug_begin(); + + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct page for the device memory. More- + * over the device memory is un-accessible thus we do not want to + * create a linear mapping for the memory like arch_add_memory() + * would do. + * + * For all other device memory types, which are accessible by + * the CPU, we do want the linear mapping and thus use + * arch_add_memory(). + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + error = add_pages(nid, PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), params); + } else { + error = kasan_add_zero_shadow(__va(range->start), range_len(range)); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + + error = arch_add_memory(nid, range->start, range_len(range), + params); + } + + if (!error) { + struct zone *zone; + + zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; + move_pfn_range_to_zone(zone, PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), params->altmap); + } + + mem_hotplug_done(); + if (error) + goto err_add_memory; + + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + PHYS_PFN(range->start), + PHYS_PFN(range_len(range)), pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) + - pfn_first(pgmap, range_id)); + return 0; + +err_add_memory: + kasan_remove_zero_shadow(__va(range->start), range_len(range)); +err_kasan: + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); +err_pfn_remap: + pgmap_array_delete(range); + return error; +} + + /* * Not device managed version of dev_memremap_pages, undone by * memunmap_pages().
Please use dev_memremap_pages if you have a struct @@ -182,17 +301,16 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) */ void *memremap_pages(struct dev_pagemap *pgmap, int nid) { - struct resource *res = &pgmap->res; - struct dev_pagemap *conflict_pgmap; struct mhp_params params = { - /* - * We do not want any optional features only our own memmap - */ .altmap = pgmap_altmap(pgmap), .pgprot = PAGE_KERNEL, }; - int error, is_ram; + const int nr_range = pgmap->nr_range; bool need_devmap_managed = true; + int error, i; + + if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) + return ERR_PTR(-EINVAL); switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: @@ -251,105 +369,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(error); } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); - if (conflict_pgmap) { - WARN(1, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); - if (conflict_pgmap) { - WARN(1, "Conflicting mapping in same section\n"); - put_dev_pagemap(conflict_pgmap); - error = -ENOMEM; - goto err_array; - } - - is_ram = region_intersects(res->start, resource_size(res), - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - - if (is_ram != REGION_DISJOINT) { - WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__, - is_ram == REGION_MIXED ? "mixed" : "ram", res); - error = -ENXIO; - goto err_array; - } - - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), - PHYS_PFN(res->end), pgmap, GFP_KERNEL)); - if (error) - goto err_array; - - if (nid < 0) - nid = numa_mem_id(); - - error = track_pfn_remap(NULL, &params.pgprot, PHYS_PFN(res->start), - 0, resource_size(res)); - if (error) - goto err_pfn_remap; - - mem_hotplug_begin(); - /* - * For device private memory we call add_pages() as we only need to - * allocate and initialize struct page for the device memory. More- - * over the device memory is un-accessible thus we do not want to - * create a linear mapping for the memory like arch_add_memory() - * would do. - * - * For all other device memory types, which are accessible by - * the CPU, we do want the linear mapping and thus use - * arch_add_memory(). + * Clear the pgmap nr_range as it will be incremented for each + * successfully processed range. This communicates how many + * regions to unwind in the abort case. */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { - error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), &params); - } else { - error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); - if (error) { - mem_hotplug_done(); - goto err_kasan; - } - - error = arch_add_memory(nid, res->start, resource_size(res), - &params); + pgmap->nr_range = 0; + error = 0; + for (i = 0; i < nr_range; i++) { + error = pagemap_range(pgmap, &params, i, nid); + if (error) + break; + pgmap->nr_range++; } - if (!error) { - struct zone *zone; - - zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; - move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), params.altmap); + if (i < nr_range) { + memunmap_pages(pgmap); + pgmap->nr_range = nr_range; + return ERR_PTR(error); } - mem_hotplug_done(); - if (error) - goto err_add_memory; - - /* - * Initialization of the pages has been deferred until now in order - * to allow us to do the work while not holding the hotplug lock.
- */ - memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], - PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); - return __va(res->start); - - err_add_memory: - kasan_remove_zero_shadow(__va(res->start), resource_size(res)); - err_kasan: - untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); - err_pfn_remap: - pgmap_array_delete(res); - err_array: - dev_pagemap_kill(pgmap); - dev_pagemap_cleanup(pgmap); - devmap_managed_enable_put(); - return ERR_PTR(error); + return __va(pgmap->ranges[0].start); } EXPORT_SYMBOL_GPL(memremap_pages); @@ -369,7 +409,7 @@ EXPORT_SYMBOL_GPL(memremap_pages); * 'live' on entry and will be killed and reaped at * devm_memremap_pages_release() time, or if this routine fails. * - * 4/ res is expected to be a host memory range that could feasibly be + * 4/ range is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ @@ -426,7 +466,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, * In the cached case we're already holding a live reference. */ if (pgmap) { - if (phys >= pgmap->res.start && phys <= pgmap->res.end) + if (phys >= pgmap->range.start && phys <= pgmap->range.end) return pgmap; put_dev_pagemap(pgmap); } @@ -451,8 +491,6 @@ void free_devmap_managed_page(struct page *page) return; } - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); __ClearPageWaiters(page); mem_cgroup_uncharge(page); diff --git a/mm/migrate.c b/mm/migrate.c index aecb1433cf3c..f94d7c7eeddf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -381,7 +381,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) int expected_count = 1; /* - * Device public or private pages have an extra refcount as they are + * Device private pages have an extra refcount as they are * ZONE_DEVICE pages. */ expected_count += is_device_private_page(page); @@ -503,7 +503,7 @@ int migrate_page_move_mapping(struct address_space *mapping, __dec_lruvec_state(old_lruvec, NR_SHMEM); __inc_lruvec_state(new_lruvec, NR_SHMEM); } - if (dirty && mapping_cap_account_dirty(mapping)) { + if (dirty && mapping_can_writeback(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); @@ -1446,7 +1446,7 @@ retry: * Capture required information that might get lost * during migration. */ - is_thp = PageTransHuge(page); + is_thp = PageTransHuge(page) && !PageHuge(page); nr_subpages = thp_nr_pages(page); cond_resched(); @@ -1472,7 +1472,7 @@ retry: * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page) && !PageHuge(page)) { + if (is_thp) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); @@ -1481,8 +1481,7 @@ retry: nr_thp_split++; goto retry; } - } - if (is_thp) { + nr_thp_failed++; nr_failed += nr_subpages; goto out; @@ -3078,7 +3077,6 @@ void migrate_vma_finalize(struct migrate_vma *migrate) remove_migration_ptes(page, newpage, false); unlock_page(page); - migrate->cpages--; if (is_zone_device_page(page)) put_page(page); diff --git a/mm/mincore.c b/mm/mincore.c index 453ff112470f..02db1a834021 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * and is up to date; i.e. 
that no page-in operation would be required * at this time if an application were to map and access this page. */ -static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) +static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; struct page *page; @@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ -#ifdef CONFIG_SWAP - if (shmem_mapping(mapping)) { - page = find_get_entry(mapping, pgoff); - /* - * shmem/tmpfs may return swap: account for swapcache - * page too. - */ - if (xa_is_value(page)) { - swp_entry_t swp = radix_to_swp_entry(page); - struct swap_info_struct *si; - - /* Prevent swap device to being swapoff under us */ - si = get_swap_device(swp); - if (si) { - page = find_get_page(swap_address_space(swp), - swp_offset(swp)); - put_swap_device(si); - } else - page = NULL; - } - } else - page = find_get_page(mapping, pgoff); -#else - page = find_get_page(mapping, pgoff); -#endif + page = find_get_incore_page(mapping, index); if (page) { present = PageUptodate(page); put_page(page); diff --git a/mm/mmap.c b/mm/mmap.c index 40248d84ad5f..67d11ad6df24 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -143,7 +143,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file_inode(file)->i_writecount); + allow_write_access(file); if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); @@ -474,8 +474,12 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, { /* * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the "next" vma being erased if - * next->vm_start was reduced. + * with the possible exception of + * + * a. the "next" vma being erased if next->vm_start was reduced in + * __vma_adjust() -> __vma_unlink() + * b. the vma being erased in detach_vmas_to_be_unmapped() -> + * vma_rb_erase() */ validate_mm_rb(root, ignore); @@ -485,13 +489,7 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, static __always_inline void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) { - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the vma being erased. - */ - validate_mm_rb(root, vma); - - __vma_rb_erase(vma, root); + vma_rb_erase_ignore(vma, root, vma); } /* @@ -623,7 +621,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); @@ -677,7 +675,7 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; } -static __always_inline void __vma_unlink_common(struct mm_struct *mm, +static __always_inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *ignore) { @@ -760,7 +758,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * vma expands, overlapping part of the next: * mprotect case 5 shifting the boundary up. 
*/ - adjust_next = (end - next->vm_start) >> PAGE_SHIFT; + adjust_next = (end - next->vm_start); exporter = next; importer = vma; VM_WARN_ON(expand != importer); @@ -770,7 +768,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * split_vma inserting another: so it must be * mprotect case 4 shifting the boundary down. */ - adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); + adjust_next = -(vma->vm_end - end); exporter = vma; importer = next; VM_WARN_ON(expand != importer); @@ -825,7 +823,7 @@ again: anon_vma_interval_tree_pre_update_vma(next); } - if (root) { + if (file) { flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, root); if (adjust_next) @@ -842,11 +840,11 @@ again: } vma->vm_pgoff = pgoff; if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; + next->vm_start += adjust_next; + next->vm_pgoff += adjust_next >> PAGE_SHIFT; } - if (root) { + if (file) { if (adjust_next) vma_interval_tree_insert(next, root); vma_interval_tree_insert(vma, root); @@ -859,7 +857,7 @@ again: * us to remove next before dropping the locks. */ if (remove_next != 3) - __vma_unlink_common(mm, next, next); + __vma_unlink(mm, next, next); else /* * vma is not before next if they've been @@ -870,7 +868,7 @@ again: * "next" (which is stored in post-swap() * "vma"). */ - __vma_unlink_common(mm, next, vma); + __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -897,10 +895,9 @@ again: anon_vma_interval_tree_post_update_vma(next); anon_vma_unlock_write(anon_vma); } - if (mapping) - i_mmap_unlock_write(mapping); - if (root) { + if (file) { + i_mmap_unlock_write(mapping); uprobe_mmap(vma); if (adjust_next) @@ -1666,7 +1663,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) /* Can the mapping track the dirty pages? */ return vma->vm_file && vma->vm_file->f_mapping && - mapping_cap_account_dirty(vma->vm_file->f_mapping); + mapping_can_writeback(vma->vm_file->f_mapping); } /* @@ -1781,7 +1778,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); if (merge) { - fput(file); + /* ->mmap() can change vma->vm_file and fput the original file. So + * fput the vma->vm_file here or we would add an extra fput for file + * and cause general protection fault ultimately. + */ + fput(vma->vm_file); vm_area_free(vma); vma = merge; /* Update vm_flags and possible addr to pick up the change. We don't @@ -1812,6 +1813,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_anonymous(vma); } + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ if (file) { @@ -3223,7 +3233,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap and in do_brk. + * Similarly in do_mmap and in do_brk_flags. 
*/ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); diff --git a/mm/mprotect.c b/mm/mprotect.c index ce8b8a5eacbb..56c02beb6041 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -603,6 +603,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } + /* Allow architectures to sanity-check the new flags */ + if (!arch_validate_flags(newflags)) { + error = -EINVAL; + goto out; + } + error = security_file_mprotect(vma, reqprot, prot); if (error) goto out; diff --git a/mm/nommu.c b/mm/nommu.c index 75a327149af1..0df7ca321314 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -5,7 +5,7 @@ * Replacement code for mm functions to support CPU's that don't * have any form of memory management unit (thus no virtual memory). * - * See Documentation/mm/nommu-mmap.rst + * See Documentation/admin-guide/mm/nommu-mmap.rst * * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e90f25d6385d..8b84661a6410 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1; * and mark_oom_victim */ DEFINE_MUTEX(oom_lock); +/* Serializes oom_score_adj and oom_score_adj_min updates */ +DEFINE_MUTEX(oom_adj_mutex); static inline bool is_memcg_oom(struct oom_control *oc) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4e4ddd67b71e..358d6f28c627 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1882,7 +1882,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) int ratelimit; int *p; - if (!bdi_cap_account_dirty(bdi)) + if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) return; if (inode_cgwb_enabled(inode)) @@ -2423,7 +2423,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) trace_writeback_dirty_page(page, mapping); - if (mapping_cap_account_dirty(mapping)) { + if (mapping_can_writeback(mapping)) { struct bdi_writeback *wb; inode_attach_wb(inode, page); @@ -2450,7 +2450,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { - if (mapping_cap_account_dirty(mapping)) { + if (mapping_can_writeback(mapping)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2513,7 +2513,7 @@ void account_page_redirty(struct page *page) { struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { + if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; @@ -2625,7 +2625,7 @@ void __cancel_dirty_page(struct page *page) { struct address_space *mapping = page_mapping(page); - if (mapping_cap_account_dirty(mapping)) { + if (mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; @@ -2665,7 +2665,7 @@ int clear_page_dirty_for_io(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); - if (mapping && mapping_cap_account_dirty(mapping)) { + if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; @@ -2738,7 +2738,7 @@ int test_clear_page_writeback(struct page *page) if (ret) { __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) { + if 
(bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); dec_wb_stat(wb, WB_WRITEBACK); @@ -2791,7 +2791,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi_cap_account_writeback(bdi)) + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); /* @@ -2849,7 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback); */ void wait_for_stable_page(struct page *page) { - if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) + if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fab5e97dc9ca..05fe1ddb033c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ #include <linux/nmi.h> #include <linux/psi.h> #include <linux/padata.h> +#include <linux/khugepaged.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -155,16 +156,16 @@ static int __init early_init_on_alloc(char *buf) int ret; bool bool_result; - if (!buf) - return -EINVAL; ret = kstrtobool(buf, &bool_result); + if (ret) + return ret; if (bool_result && page_poisoning_enabled()) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); if (bool_result) static_branch_enable(&init_on_alloc); else static_branch_disable(&init_on_alloc); - return ret; + return 0; } early_param("init_on_alloc", early_init_on_alloc); @@ -173,16 +174,16 @@ static int __init early_init_on_free(char *buf) int ret; bool bool_result; - if (!buf) - return -EINVAL; ret = kstrtobool(buf, &bool_result); + if (ret) + return ret; if (bool_result && page_poisoning_enabled()) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); if (bool_result) static_branch_enable(&init_on_free); else static_branch_disable(&init_on_free); - return ret; + return 0; } early_param("init_on_free", early_init_on_free); @@ -3367,9 +3368,16 @@ struct page *rmqueue(struct zone *preferred_zone, struct page *page; if (likely(order == 0)) { - page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, + /* + * MIGRATE_MOVABLE pcplist could have the pages on CMA area and + * we need to skip it when CMA area isn't allowed. + */ + if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || + migratetype != MIGRATE_MOVABLE) { + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, migratetype, alloc_flags); - goto out; + goto out; + } } /* @@ -3381,7 +3389,13 @@ struct page *rmqueue(struct zone *preferred_zone, do { page = NULL; - if (alloc_flags & ALLOC_HARDER) { + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. + */ + if (order > 0 && alloc_flags & ALLOC_HARDER) { page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -3727,8 +3741,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->highest_zoneidx, ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, + ac->nodemask) { struct page *page; unsigned long mark; @@ -3972,8 +3986,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * success so it is time to admit defeat. 
We will skip the OOM killer * because it is very likely that the caller has a more reasonable * fallback than shooting a random task. + * + * The OOM killer may not free memory on a specific node. */ - if (gfp_mask & __GFP_RETRY_MAYFAIL) + if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->highest_zoneidx < ZONE_NORMAL) @@ -3990,10 +4006,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * failures more gracefully we should just bail out here. */ - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; @@ -4241,13 +4253,12 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif /* Perform direct synchronous page reclaim */ -static int +static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { - int progress; unsigned int noreclaim_flag; - unsigned long pflags; + unsigned long pflags, progress; cond_resched(); @@ -4826,12 +4837,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); - return true; -} - -/* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) -{ /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4842,6 +4847,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); + + return true; } /* @@ -4870,8 +4877,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, &ac); - /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. @@ -4947,6 +4952,9 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) free_the_page(page, order); + else if (!PageHead(page)) + while (order-- > 0) + free_the_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages); @@ -5637,7 +5645,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = NUMA_NO_NODE; - const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -5658,8 +5665,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - tmp = cpumask_of_node(n); - if (!cpumask_empty(tmp)) + if (!cpumask_empty(cpumask_of_node(n))) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ @@ -5955,7 +5961,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) if (mirrored_kernelcore && zone == ZONE_MOVABLE) { if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (*pfn < memblock_region_memory_end_pfn(r)) break; } @@ -5975,7 +5981,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * done. Non-atomic initialization, single-pass. 
*/ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum memmap_context context, + unsigned long start_pfn, enum meminit_context context, struct vmem_altmap *altmap) { unsigned long pfn, end_pfn = start_pfn + size; @@ -6007,7 +6013,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context == MEMMAP_EARLY) { + if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6016,7 +6022,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMMAP_HOTPLUG) + if (context == MEMINIT_HOTPLUG) __SetPageReserved(page); /* @@ -6099,7 +6105,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. * - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { @@ -6137,7 +6143,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; memmap_init_zone(size, nid, zone, start_pfn, - MEMMAP_EARLY, NULL); + MEMINIT_EARLY, NULL); } } } @@ -6540,7 +6546,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long start_pfn, end_pfn; struct memblock_region *r; - for_each_memblock(memory, r) { + for_each_mem_region(r) { start_pfn = clamp(memblock_region_memory_base_pfn(r), zone_start_pfn, zone_end_pfn); end_pfn = clamp(memblock_region_memory_end_pfn(r), @@ -6984,8 +6990,7 @@ static void __init init_unavailable_mem(void) * Loop through unavailable ranges not covered by memblock.memory. */ pgcnt = 0; - for_each_mem_range(i, &memblock.memory, NULL, - NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { + for_each_mem_range(i, &start, &end) { if (next < start) pgcnt += init_unavailable_range(PFN_DOWN(next), PFN_UP(start)); @@ -7135,7 +7140,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (!memblock_is_hotpluggable(r)) continue; @@ -7156,7 +7161,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; - for_each_memblock(memory, r) { + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; @@ -7891,6 +7896,8 @@ int __meminit init_per_zone_wmark_min(void) setup_min_slab_ratio(); #endif + khugepaged_min_free_kbytes_update(); + return 0; } postcore_initcall(init_per_zone_wmark_min) @@ -8218,14 +8225,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, { unsigned long iter = 0; unsigned long pfn = page_to_pfn(page); - - /* - * TODO we could make this much more efficient by not checking every - * page in the range if we know all of them are in MOVABLE_ZONE and - * that the movable zone guarantees that pages are migratable but - * the later is not the case right now unfortunatelly. E.g. movablecore - * can still lead to having bootmem allocations in zone_movable. 
- */ + unsigned long offset = pfn % pageblock_nr_pages; if (is_migrate_cma_page(page)) { /* @@ -8239,12 +8239,18 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, return page; } - for (; iter < pageblock_nr_pages; iter++) { + for (; iter < pageblock_nr_pages - offset; iter++) { if (!pfn_valid_within(pfn + iter)) continue; page = pfn_to_page(pfn + iter); + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". + */ if (PageReserved(page)) return page; @@ -8318,14 +8324,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, * it. But now, memory offline itself doesn't call * shrink_node_slabs() and it still to be fixed. */ - /* - * If the page is not RAM, page_count()should be 0. - * we don't need more check. This is an _used_ not-movable page. - * - * The problematic thing here is PG_reserved pages. PG_reserved - * is set to both of a memory hole page and a _used_ kernel - * page at boot. - */ return page; } return NULL; diff --git a/mm/page_counter.c b/mm/page_counter.c index afe22ad335cc..b24a60b28bb0 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -109,7 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter, * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading - * the limit. When racing with page_counter_limit(), + * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ diff --git a/mm/page_io.c b/mm/page_io.c index e485a6e8a6cd..433df1263349 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -252,6 +252,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } + /* + * Arch code may have to preserve more data than just the page + * contents, e.g. memory tags. 
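/*
 * Illustrative sketch, not part of the patch: per the comment above, the
 * swap_writepage() change continuing just below asks an architecture hook to
 * stash any extra per-page state (such as memory tags) before writeout, and
 * backs off with the page left dirty if that fails.  The weak-symbol default
 * is only a userspace way to model an overridable hook -- the kernel's no-op
 * fallback actually lives in a header.  prepare_hook() and write_out() are
 * hypothetical names.
 */
#include <stdio.h>

__attribute__((weak)) int prepare_hook(void *page)
{
	(void)page;
	return 0;		/* default: nothing extra to save */
}

static int write_out(void *page)
{
	printf("writing %p\n", page);
	return 0;
}

static int writepage_sketch(void *page)
{
	int ret = prepare_hook(page);

	if (ret)
		return ret;	/* keep the page dirty; retry on a later pass */
	return write_out(page);
}

int main(void)
{
	int dummy;

	return writepage_sketch(&dummy);
}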
+ */ + ret = arch_prepare_to_swap(page); + if (ret) { + set_page_dirty(page); + unlock_page(page); + goto out; + } if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); @@ -302,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -349,13 +359,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - ret = 0; bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); unlock_page(page); - ret = -ENOMEM; - goto out; + return -ENOMEM; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); bio_associate_blkg_from_page(bio, page); @@ -363,8 +371,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); submit_bio(bio); -out: - return ret; + + return 0; } int swap_readpage(struct page *page, bool synchronous) @@ -393,7 +401,7 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; @@ -403,15 +411,17 @@ int swap_readpage(struct page *page, bool synchronous) goto out; } - ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); - if (!ret) { - if (trylock_page(page)) { - swap_slot_free_notify(page); - unlock_page(page); - } + if (sis->flags & SWP_SYNCHRONOUS_IO) { + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + if (trylock_page(page)) { + swap_slot_free_notify(page); + unlock_page(page); + } - count_vm_event(PSWPIN); - goto out; + count_vm_event(PSWPIN); + goto out; + } } ret = 0; @@ -455,7 +465,7 @@ int swap_set_page_dirty(struct page *page) { struct swap_info_struct *sis = page_swap_info(page); - if (data_race(sis->flags & SWP_FS)) { + if (data_race(sis->flags & SWP_FS_OPS)) { struct address_space *mapping = sis->swap_file->f_mapping; VM_BUG_ON_PAGE(!PageSwapCache(page), page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 63a3db10a8c0..aa94afb63823 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -17,22 +17,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { - struct page *unmovable = NULL; - struct zone *zone; + struct zone *zone = page_zone(page); + struct page *unmovable; unsigned long flags; - int ret = -EBUSY; - - zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); /* * We assume the caller intended to SET migrate type to isolate. * If it is already set, then someone else must have raced and - * set it before us. Return -EBUSY + * set it before us. */ - if (is_migrate_isolate_page(page)) - goto out; + if (is_migrate_isolate_page(page)) { + spin_unlock_irqrestore(&zone->lock, flags); + return -EBUSY; + } /* * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 
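/*
 * Illustrative sketch, not part of the patch: the set_migratetype_isolate()
 * rework begun above and continuing below releases zone->lock before draining
 * per-cpu pages on success and before dumping the offending page on failure,
 * since neither drain_all_pages() nor printk-heavy reporting belongs under a
 * spinlock.  A pthread mutex stands in for the zone lock; mark_isolated(),
 * drain_stub() and report_stub() are hypothetical helpers.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;

static void mark_isolated(void) { }			/* cheap, done under the lock */
static void drain_stub(void)   { puts("drain"); }	/* heavy, done after unlock */
static void report_stub(void)  { puts("unmovable page"); }

static int isolate_sketch(int movable, int report_failure)
{
	pthread_mutex_lock(&zone_lock);
	if (movable) {
		mark_isolated();
		pthread_mutex_unlock(&zone_lock);
		drain_stub();
		return 0;
	}
	pthread_mutex_unlock(&zone_lock);
	if (report_failure)
		report_stub();
	return -1;
}

int main(void)
{
	return isolate_sketch(1, 0);
}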
@@ -49,25 +48,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ NULL); __mod_zone_freepage_state(zone, -nr_pages, mt); - ret = 0; + spin_unlock_irqrestore(&zone->lock, flags); + drain_all_pages(zone); + return 0; } -out: spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) { - drain_all_pages(zone); - } else { - WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); - - if ((isol_flags & REPORT_FAILURE) && unmovable) - /* - * printk() with zone->lock held will likely trigger a - * lockdep splat, so defer it here. - */ - dump_page(unmovable, "unmovable page"); + if (isol_flags & REPORT_FAILURE) { + /* + * printk() with zone->lock held will likely trigger a + * lockdep splat, so defer it here. + */ + dump_page(unmovable, "unmovable page"); } - return ret; + return -EBUSY; } static void unset_migratetype_isolate(struct page *page, unsigned migratetype) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 29c052099aff..fd12da80b6f2 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -14,10 +14,6 @@ #include <linux/slab.h> #include <linux/syscalls.h> -#ifdef CONFIG_COMPAT -#include <linux/compat.h> -#endif - /** * process_vm_rw_pages - read/write pages from task specified * @pages: array of pointers to pages we want to copy @@ -276,20 +272,17 @@ static ssize_t process_vm_rw(pid_t pid, if (rc < 0) return rc; if (!iov_iter_count(&iter)) - goto free_iovecs; - - rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, - iovstack_r, &iov_r); - if (rc <= 0) - goto free_iovecs; - + goto free_iov_l; + iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false); + if (IS_ERR(iov_r)) { + rc = PTR_ERR(iov_r); + goto free_iov_l; + } rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); - -free_iovecs: if (iov_r != iovstack_r) kfree(iov_r); +free_iov_l: kfree(iov_l); - return rc; } @@ -307,68 +300,3 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); } - -#ifdef CONFIG_COMPAT - -static ssize_t -compat_process_vm_rw(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags, int vm_write) -{ - struct iovec iovstack_l[UIO_FASTIOV]; - struct iovec iovstack_r[UIO_FASTIOV]; - struct iovec *iov_l = iovstack_l; - struct iovec *iov_r = iovstack_r; - struct iov_iter iter; - ssize_t rc = -EFAULT; - int dir = vm_write ? 
WRITE : READ; - - if (flags != 0) - return -EINVAL; - - rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); - if (rc < 0) - return rc; - if (!iov_iter_count(&iter)) - goto free_iovecs; - rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, - UIO_FASTIOV, iovstack_r, - &iov_r); - if (rc <= 0) - goto free_iovecs; - - rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); - -free_iovecs: - if (iov_r != iovstack_r) - kfree(iov_r); - kfree(iov_l); - return rc; -} - -COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, - const struct compat_iovec __user *, lvec, - compat_ulong_t, liovcnt, - const struct compat_iovec __user *, rvec, - compat_ulong_t, riovcnt, - compat_ulong_t, flags) -{ - return compat_process_vm_rw(pid, lvec, liovcnt, rvec, - riovcnt, flags, 0); -} - -COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, - const struct compat_iovec __user *, lvec, - compat_ulong_t, liovcnt, - const struct compat_iovec __user *, rvec, - compat_ulong_t, riovcnt, - compat_ulong_t, flags) -{ - return compat_process_vm_rw(pid, lvec, liovcnt, rvec, - riovcnt, flags, 1); -} - -#endif diff --git a/mm/shmem.c b/mm/shmem.c index 8e2b35ba93ad..6d4ddef4a24f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1736,6 +1736,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, } wait_on_page_writeback(page); + /* + * Some architectures may have to restore extra metadata to the + * physical page after reading from swap. + */ + arch_swap_restore(swap, page); + if (shmem_should_replace_page(page, gfp)) { error = shmem_replace_page(&page, gfp, info, index); if (error) @@ -1824,6 +1830,8 @@ repeat: return error; } + if (page) + hindex = page->index; if (page && sgp == SGP_WRITE) mark_page_accessed(page); @@ -1834,11 +1842,10 @@ repeat: unlock_page(page); put_page(page); page = NULL; + hindex = index; } - if (page || sgp == SGP_READ) { - *pagep = page; - return 0; - } + if (page || sgp == SGP_READ) + goto out; /* * Fast cache lookup did not find it: @@ -1963,14 +1970,13 @@ clear: * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); int i; - for (i = 0; i < compound_nr(head); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); + for (i = 0; i < compound_nr(page); i++) { + clear_highpage(page + i); + flush_dcache_page(page + i); } - SetPageUptodate(head); + SetPageUptodate(page); } /* Perhaps the file has been truncated since we checked */ @@ -1986,6 +1992,7 @@ clear: error = -EINVAL; goto unlock; } +out: *pagep = page + index - hindex; return 0; @@ -2269,6 +2276,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags &= ~(VM_MAYWRITE); } + /* arm64 - allow memory tagging on RAM-based files */ + vma->vm_flags |= VM_MTE_ALLOWED; + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && diff --git a/mm/slab.c b/mm/slab.c index 3160dff6fd76..399a9d185b0f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1632,6 +1632,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +/* + * Update the size of the caches before calling slabs_destroy as it may + * recursively call kfree. 
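/*
 * Illustrative sketch, not part of the patch: per the comment above, the
 * do_drain() and cache_flusharray() hunks that follow update ac->avail before
 * calling slabs_destroy(), because destroying a slab can recursively call
 * kfree() and re-enter this path -- with the old ordering such a re-entrant
 * free could still see the just-flushed objects counted as available.  A toy
 * model of "fix up the bookkeeping before the possibly re-entrant call";
 * flush_objects() and maybe_reenter() are hypothetical stand-ins.
 */
struct cache_sketch {
	int avail;		/* objects still parked in the per-cpu array */
};

static void flush_objects(struct cache_sketch *c, int batch)
{
	(void)c; (void)batch;	/* stand-in for free_block() */
}

static void maybe_reenter(struct cache_sketch *c)
{
	(void)c;		/* a nested free would land back in flush_sketch() */
}

static void flush_sketch(struct cache_sketch *c, int batch)
{
	flush_objects(c, batch);
	c->avail -= batch;	/* bookkeeping made consistent ... */
	maybe_reenter(c);	/* ... before anything that may re-enter */
}

int main(void)
{
	struct cache_sketch c = { .avail = 8 };

	flush_sketch(&c, 8);
	return c.avail;		/* 0 */
}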
+ */ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) { struct page *page, *n; @@ -2153,8 +2157,8 @@ static void do_drain(void *arg) spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); ac->avail = 0; + slabs_destroy(cachep, &list); } static void drain_cpu_caches(struct kmem_cache *cachep) @@ -2301,8 +2305,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - if (!freelist) - return NULL; } else { /* We will use last bytes at the slab for freelist */ freelist = addr + (PAGE_SIZE << cachep->gfporder) - @@ -3402,9 +3404,9 @@ free_done: } #endif spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); + slabs_destroy(cachep, &list); } /* @@ -3436,7 +3438,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp); + memcg_slab_free_hook(cachep, &objp, 1); /* * Skip calling cache_free_alien() when the platform is not numa. diff --git a/mm/slab.h b/mm/slab.h index 6cc323f1313a..6dd4b702888a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -345,30 +345,42 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, obj_cgroup_put(objcg); } -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page, - void *p) +static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, + void **p, int objects) { + struct kmem_cache *s; struct obj_cgroup *objcg; + struct page *page; unsigned int off; + int i; if (!memcg_kmem_enabled()) return; - if (!page_has_obj_cgroups(page)) - return; + for (i = 0; i < objects; i++) { + if (unlikely(!p[i])) + continue; - off = obj_to_index(s, page, p); - objcg = page_obj_cgroups(page)[off]; - page_obj_cgroups(page)[off] = NULL; + page = virt_to_head_page(p[i]); + if (!page_has_obj_cgroups(page)) + continue; - if (!objcg) - return; + if (!s_orig) + s = page->slab_cache; + else + s = s_orig; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), - -obj_full_size(s)); + off = obj_to_index(s, page, p[i]); + objcg = page_obj_cgroups(page)[off]; + if (!objcg) + continue; - obj_cgroup_put(objcg); + page_obj_cgroups(page)[off] = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } } #else /* CONFIG_MEMCG_KMEM */ @@ -406,8 +418,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, { } -static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page, - void *p) +static inline void memcg_slab_free_hook(struct kmem_cache *s, + void **p, int objects) { } #endif /* CONFIG_MEMCG_KMEM */ diff --git a/mm/slub.c b/mm/slub.c index d4177aecedf6..61d0d2968413 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1413,10 +1413,6 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, char *next_block; slab_flags_t block_flags; - /* If slub_debug = 0, it folds into the if conditional. 
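/*
 * Illustrative sketch, not part of the patch: the slab.h hunk above turns the
 * per-object memcg free hook into a bulk hook that walks a pointer array and
 * skips NULL slots, so kmem_cache_free_bulk() can uncharge a whole batch
 * while single-object callers just pass a one-element array (as do_slab_free()
 * does further down with &head, 1).  uncharge_one() is a hypothetical
 * stand-in for the real accounting work.
 */
#include <stddef.h>

static int uncharged;

static void uncharge_one(void *obj)
{
	(void)obj;
	uncharged++;
}

static void free_hook_bulk(void **p, int objects)
{
	for (int i = 0; i < objects; i++) {
		if (!p[i])		/* bulk callers may hand in sparse arrays */
			continue;
		uncharge_one(p[i]);
	}
}

static void free_hook_single(void *obj)
{
	free_hook_bulk(&obj, 1);	/* the old entry point becomes a 1-element call */
}

int main(void)
{
	int a, b;
	void *batch[3] = { &a, NULL, &b };

	free_hook_bulk(batch, 3);
	free_hook_single(&a);
	return uncharged - 3;		/* 0 */
}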
*/ - if (!slub_debug_string) - return flags | slub_debug; - len = strlen(name); next_block = slub_debug_string; /* Go through all blocks of debug options, see if any matches our slab's name */ @@ -1450,7 +1446,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, } } - return slub_debug; + return flags | slub_debug; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, @@ -2249,7 +2245,8 @@ redo: } } else { m = M_FULL; - if (kmem_cache_debug(s) && !lock) { +#ifdef CONFIG_SLUB_DEBUG + if ((s->flags & SLAB_STORE_USER) && !lock) { lock = 1; /* * This also ensures that the scanning of full @@ -2258,6 +2255,7 @@ redo: */ spin_lock(&n->list_lock); } +#endif } if (l != m) { @@ -2665,6 +2663,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void *freelist; struct page *page; + stat(s, ALLOC_SLOWPATH); + page = c->page; if (!page) { /* @@ -2854,7 +2854,6 @@ redo: page = c->page; if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - stat(s, ALLOC_SLOWPATH); } else { void *next_object = get_freepointer_safe(s, object); @@ -3023,20 +3022,21 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (likely(!n)) { - /* - * If we just froze the page then put it onto the - * per cpu partial list. - */ - if (new.frozen && !was_frozen) { + if (likely(was_frozen)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + stat(s, FREE_FROZEN); + } else if (new.frozen) { + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ put_cpu_partial(s, page, 1); stat(s, CPU_PARTIAL_FREE); } - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - if (was_frozen) - stat(s, FREE_FROZEN); + return; } @@ -3095,7 +3095,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long tid; - memcg_slab_free_hook(s, page, head); + memcg_slab_free_hook(s, &head, 1); redo: /* * Determine the currently cpus per cpu slab. 
@@ -3257,6 +3257,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (WARN_ON(!size)) return; + memcg_slab_free_hook(s, p, size); do { struct detached_freelist df; diff --git a/mm/sparse.c b/mm/sparse.c index fcc3d176f1ea..b25ad8e64839 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -291,13 +291,11 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en */ static void __init memblocks_present(void) { - struct memblock_region *reg; + unsigned long start, end; + int i, nid; - for_each_memblock(memory, reg) { - memory_present(memblock_get_region_node(reg), - memblock_region_memory_base_pfn(reg), - memblock_region_memory_end_pfn(reg)); - } + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) + memory_present(nid, start, end); } /* diff --git a/mm/swap.c b/mm/swap.c index e7bdf094f76a..47a47681c86b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -348,7 +348,7 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -void activate_page(struct page *page) +static void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -368,7 +368,7 @@ static inline void activate_page_drain(int cpu) { } -void activate_page(struct page *page) +static void activate_page(struct page *page) { pg_data_t *pgdat = page_pgdat(page); @@ -481,9 +481,7 @@ EXPORT_SYMBOL(lru_cache_add); * @vma: vma in which page is mapped for determining reclaimability * * Place @page on the inactive or unevictable LRU list, depending on its - * evictability. Note that if the page is not evictable, it goes - * directly back onto it's zone's unevictable list, it does NOT use a - * per cpu pagevec. + * evictability. */ void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) @@ -763,10 +761,20 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ void lru_add_drain_all(void) { - static seqcount_t seqcount = SEQCNT_ZERO(seqcount); - static DEFINE_MUTEX(lock); + /* + * lru_drain_gen - Global pages generation number + * + * (A) Definition: global lru_drain_gen = x implies that all generations + * 0 < n <= x are already *scheduled* for draining. + * + * This is an optimization for the highly-contended use case where a + * user space workload keeps constantly generating a flow of pages for + * each CPU. + */ + static unsigned int lru_drain_gen; static struct cpumask has_work; - int cpu, seq; + static DEFINE_MUTEX(lock); + unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully @@ -775,21 +783,54 @@ void lru_add_drain_all(void) if (WARN_ON(!mm_percpu_wq)) return; - seq = raw_read_seqcount_latch(&seqcount); + /* + * Guarantee pagevec counter stores visible by this CPU are visible to + * other CPUs before loading the current drain generation. + */ + smp_mb(); + + /* + * (B) Locally cache global LRU draining generation number + * + * The read barrier ensures that the counter is loaded before the mutex + * is taken. It pairs with smp_mb() inside the mutex critical section + * at (D). + */ + this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* - * Piggyback on drain started and finished while we waited for lock: - * all pages pended at the time of our enter were drained from vectors. + * (C) Exit the draining operation if a newer generation, from another + * lru_add_drain_all(), was already scheduled for draining. Check (A). 
*/ - if (__read_seqcount_retry(&seqcount, seq)) + if (unlikely(this_gen != lru_drain_gen)) goto done; - raw_write_seqcount_latch(&seqcount); + /* + * (D) Increment global generation number + * + * Pairs with smp_load_acquire() at (B), outside of the critical + * section. Use a full memory barrier to guarantee that the new global + * drain generation number is stored before loading pagevec counters. + * + * This pairing must be done here, before the for_each_online_cpu loop + * below which drains the page vectors. + * + * Let x, y, and z represent some system CPU numbers, where x < y < z. + * Assume CPU #z is is in the middle of the for_each_online_cpu loop + * below and has already reached CPU #y's per-cpu data. CPU #x comes + * along, adds some pages to its per-cpu vectors, then calls + * lru_add_drain_all(). + * + * If the paired barrier is done at any later step, e.g. after the + * loop, CPU #x will just exit at (C) and miss flushing out all of its + * added pages. + */ + WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); + smp_mb(); cpumask_clear(&has_work); - for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); @@ -801,7 +842,7 @@ void lru_add_drain_all(void) need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); - cpumask_set_cpu(cpu, &has_work); + __cpumask_set_cpu(cpu, &has_work); } } @@ -816,7 +857,7 @@ void lru_add_drain_all(void) { lru_add_drain(); } -#endif +#endif /* CONFIG_SMP */ /** * release_pages - batched put_page() @@ -848,6 +889,7 @@ void release_pages(struct page **pages, int nr) locked_pgdat = NULL; } + page = compound_head(page); if (is_huge_zero_page(page)) continue; @@ -859,7 +901,7 @@ void release_pages(struct page **pages, int nr) } /* * ZONE_DEVICE pages that return 'false' from - * put_devmap_managed_page() do not require special + * page_is_devmap_managed() do not require special * processing, and instead, expect a call to * put_page_testzero(). */ @@ -869,7 +911,6 @@ void release_pages(struct page **pages, int nr) } } - page = compound_head(page); if (!put_page_testzero(page)) continue; @@ -900,8 +941,6 @@ void release_pages(struct page **pages, int nr) del_page_from_lru_list(page, lruvec, page_off_lru(page)); } - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 3e6453573a89..0357fbe70645 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -237,7 +237,7 @@ static int free_slot_cache(unsigned int cpu) return 0; } -int enable_swap_slots_cache(void) +void enable_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_enable_mutex); if (!swap_slot_cache_initialized) { @@ -255,7 +255,6 @@ int enable_swap_slots_cache(void) __reenable_swap_slots_cache(); out_unlock: mutex_unlock(&swap_slots_cache_enable_mutex); - return 0; } /* called with swap slot cache's alloc lock held */ diff --git a/mm/swap_state.c b/mm/swap_state.c index c16eebb81d8b..aa40e706604c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,6 +21,7 @@ #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> +#include <linux/shmem_fs.h> #include "internal.h" /* @@ -414,6 +415,39 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, return page; } +/** + * find_get_incore_page - Find and get a page from the page or swap caches. + * @mapping: The address_space to search. + * @index: The page cache index. 
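/*
 * Illustrative sketch, not part of the patch: the lru_add_drain_all() rework
 * above replaces the latched seqcount with a bare generation counter -- read
 * the generation, take the mutex, and bail out if a newer drain was already
 * scheduled while waiting, since that pass covers our pages too.  A userspace
 * model with C11 atomics and a pthread mutex; the kernel's extra memory
 * barriers around the pagevec counters are not reproduced, and
 * drain_all_cpus() is a hypothetical stand-in for queueing the per-CPU work.
 */
#include <pthread.h>
#include <stdatomic.h>

static atomic_uint drain_gen;
static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;

static void drain_all_cpus(void)
{
	/* stand-in for queueing lru_add_drain_per_cpu() everywhere and waiting */
}

static void drain_all_sketch(void)
{
	unsigned int this_gen = atomic_load_explicit(&drain_gen, memory_order_acquire);

	pthread_mutex_lock(&drain_lock);
	if (this_gen != atomic_load_explicit(&drain_gen, memory_order_relaxed)) {
		pthread_mutex_unlock(&drain_lock);	/* piggy-back on the newer drain */
		return;
	}
	atomic_fetch_add_explicit(&drain_gen, 1, memory_order_release);
	drain_all_cpus();
	pthread_mutex_unlock(&drain_lock);
}

int main(void)
{
	drain_all_sketch();
	return 0;
}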
+ * + * This differs from find_get_page() in that it will also look for the + * page in the swap cache. + * + * Return: The found page or %NULL. + */ +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + swp_entry_t swp; + struct swap_info_struct *si; + struct page *page = find_get_entry(mapping, index); + + if (!page) + return page; + if (!xa_is_value(page)) + return find_subpage(page, index); + if (!shmem_mapping(mapping)) + return NULL; + + swp = radix_to_swp_entry(page); + /* Prevent swapoff from happening to us */ + si = get_swap_device(swp); + if (!si) + return NULL; + page = find_get_page(swap_address_space(swp), swp_offset(swp)); + put_swap_device(si); + return page; +} + struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) @@ -631,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, goto skip; /* Test swap type to make sure the dereference is safe */ - if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { + if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { struct inode *inode = si->swap_file->f_mapping->host; if (inode_read_congested(inode)) goto skip; diff --git a/mm/swapfile.c b/mm/swapfile.c index 12f59e641b5e..c4a613688a17 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -717,6 +717,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, else swap_slot_free_notify = NULL; while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); frontswap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); @@ -1078,7 +1079,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FS)) + if (si->flags & SWP_BLKDEV) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, @@ -1183,7 +1184,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) bad_free: pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); - goto out; out: return NULL; } @@ -1801,13 +1801,12 @@ int free_swap_and_cache(swp_entry_t entry) * * This is needed for the suspend to disk (aka swsusp). 
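/*
 * Illustrative sketch, not part of the patch: find_get_incore_page(), added
 * above, does an ordinary page-cache lookup first and, when that returns a
 * swap placeholder for a shmem mapping, chases it into the swap cache (while
 * holding a reference on the swap device so swapoff cannot race).  A toy
 * two-level lookup with the xarray value entry reduced to a tagged struct;
 * entry_sketch, cache_lookup() and swap_cache_lookup() are all hypothetical,
 * and the swapoff protection is omitted.
 */
#include <stdbool.h>
#include <stddef.h>

struct entry_sketch {
	bool swap_placeholder;
	void *page;			/* valid when !swap_placeholder */
	unsigned long swap_slot;	/* valid when swap_placeholder */
};

static struct entry_sketch *cache_lookup(unsigned long index)
{
	(void)index;
	return NULL;			/* stand-in for find_get_entry() */
}

static void *swap_cache_lookup(unsigned long slot)
{
	(void)slot;
	return NULL;			/* stand-in for the swap-cache find_get_page() */
}

static void *find_incore_sketch(unsigned long index, bool mapping_is_shmem)
{
	struct entry_sketch *e = cache_lookup(index);

	if (!e)
		return NULL;
	if (!e->swap_placeholder)
		return e->page;		/* plain page-cache hit */
	if (!mapping_is_shmem)
		return NULL;		/* only shmem keeps swap entries in its tree */
	return swap_cache_lookup(e->swap_slot);
}

int main(void)
{
	return find_incore_sketch(0, true) != NULL;
}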
*/ -int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) +int swap_type_of(dev_t device, sector_t offset) { - struct block_device *bdev = NULL; int type; - if (device) - bdev = bdget(device); + if (!device) + return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { @@ -1816,30 +1815,34 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!(sis->flags & SWP_WRITEOK)) continue; - if (!bdev) { - if (bdev_p) - *bdev_p = bdgrab(sis->bdev); - - spin_unlock(&swap_lock); - return type; - } - if (bdev == sis->bdev) { + if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { - if (bdev_p) - *bdev_p = bdgrab(sis->bdev); - spin_unlock(&swap_lock); - bdput(bdev); return type; } } } spin_unlock(&swap_lock); - if (bdev) - bdput(bdev); + return -ENODEV; +} + +int find_first_swap(dev_t *device) +{ + int type; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; + if (!(sis->flags & SWP_WRITEOK)) + continue; + *device = sis->bdev->bd_dev; + spin_unlock(&swap_lock); + return type; + } + spin_unlock(&swap_lock); return -ENODEV; } @@ -1925,11 +1928,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } swap_free(entry); - /* - * Move the page to the active list so it is not - * immediately swapped out again after swapon. - */ - activate_page(page); out: pte_unmap_unlock(pte, ptl); if (page != swapcache) { @@ -2433,7 +2431,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (ret >= 0) sis->flags |= SWP_ACTIVATED; if (!ret) { - sis->flags |= SWP_FS; + sis->flags |= SWP_FS_OPS; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } @@ -2682,6 +2680,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); + arch_swap_invalidate_area(p->type); frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); @@ -2920,10 +2919,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) int error; if (S_ISBLK(inode->i_mode)) { - p->bdev = bdgrab(I_BDEV(inode)); - error = blkdev_get(p->bdev, + p->bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); - if (error < 0) { + if (IS_ERR(p->bdev)) { + error = PTR_ERR(p->bdev); p->bdev = NULL; return error; } @@ -3234,10 +3233,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } - if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) p->flags |= SWP_STABLE_WRITES; - if (bdi_cap_synchronous_io(inode_to_bdi(inode))) + if (p->bdev && p->bdev->bd_disk->fops->rw_page) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -3343,7 +3342,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto bad_swap_unlock_inode; + goto free_swap_address_space; } mutex_lock(&swapon_mutex); @@ -3368,6 +3367,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_address_space: + exit_swap_address_space(p->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: diff --git a/mm/truncate.c 
b/mm/truncate.c index dd9ebc1da356..6bbe0f0b3ce9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -528,23 +528,8 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - * - * Return: the number of the pages that were invalidated - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) +unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -610,8 +595,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, * Invalidation is a hint that the page is no longer * of interest and try to speed up its reclaim. */ - if (!ret) + if (!ret) { deactivate_file_page(page); + /* It is likely on the pagevec of a remote CPU */ + if (nr_pagevec) + (*nr_pagevec)++; + } + if (PageTransHuge(page)) put_page(page); count += ret; @@ -623,8 +613,40 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } return count; } + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + * + * Return: the number of the pages that were invalidated + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + return __invalidate_mapping_pages(mapping, start, end, NULL); +} EXPORT_SYMBOL(invalidate_mapping_pages); +/** + * This helper is similar with the above one, except that it accounts for pages + * that are likely on a pagevec and count them in @nr_pagevec, which will used by + * the caller. + */ +void invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) +{ + __invalidate_mapping_pages(mapping, start, end, nr_pagevec); +} + /* * This is like invalidate_complete_page(), except it ignores the page's * refcount. 
We do this because invalidate_inode_pages2() needs stronger diff --git a/mm/util.c b/mm/util.c index 5ef378a2a038..4e21fe7eae27 100644 --- a/mm/util.c +++ b/mm/util.c @@ -957,7 +957,7 @@ out: return res; } -int memcmp_pages(struct page *page1, struct page *page2) +int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index be4724b916b3..04ac98bf5045 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2133,7 +2133,7 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, * It is up to the caller to do all required locking to keep the returned * pointer valid. * - * Return: pointer to the found area or %NULL on faulure + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *find_vm_area(const void *addr) { @@ -2154,7 +2154,7 @@ struct vm_struct *find_vm_area(const void *addr) * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. * - * Return: pointer to the found area or %NULL on faulure + * Return: the area descriptor on success or %NULL on failure. */ struct vm_struct *remove_vm_area(const void *addr) { @@ -2447,7 +2447,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { - /* Successfully allocated i pages, free them in __vunmap() */ + /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; diff --git a/mm/vmscan.c b/mm/vmscan.c index 466fc3144fff..879fb57c5045 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -699,6 +699,9 @@ void drop_slab_node(int nid) do { struct mem_cgroup *memcg = NULL; + if (fatal_signal_pending(current)) + return; + freed = 0; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { @@ -1751,7 +1754,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * Restrictions: * * (1) Must be called with an elevated refcount on the page. This is a - * fundamentnal difference from isolate_lru_pages (which is called + * fundamental difference from isolate_lru_pages (which is called * without a stable reference). * (2) the lru_lock must not be held. * (3) interrupts must be enabled. diff --git a/mm/z3fold.c b/mm/z3fold.c index 460b0feced26..18feaa0bc537 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -212,13 +212,12 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, { struct z3fold_buddy_slots *slots; - slots = kmem_cache_alloc(pool->c_handle, + slots = kmem_cache_zalloc(pool->c_handle, (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); if (slots) { /* It will be freed separately in free_handle(). */ kmemleak_not_leak(slots); - memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; rwlock_init(&slots->lock); } diff --git a/mm/zbud.c b/mm/zbud.c index bc93aa4e46fc..c49966ece674 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -367,7 +367,6 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, spin_lock(&pool->lock); /* First, try to find an unbuddied zbud page. */ - zhdr = NULL; for_each_unbuddied_list(i, chunks) { if (!list_empty(&pool->unbuddied[i])) { zhdr = list_first_entry(&pool->unbuddied[i], |
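/*
 * Illustrative sketch, not part of the patch: the drop_slab_node() hunk above
 * lets the shrink loop bail out when the calling task already has a fatal
 * signal pending, instead of looping until a pass frees (almost) nothing.
 * The loop shape below is approximate and the signal check is reduced to an
 * atomic flag; shrink_once() is a hypothetical stand-in for one shrinker pass.
 */
#include <stdatomic.h>

static atomic_bool fatal_pending;

static unsigned long shrink_once(void)
{
	return 0;			/* pretend nothing more could be freed */
}

static void drop_slab_sketch(void)
{
	unsigned long freed;

	do {
		if (atomic_load(&fatal_pending))
			return;		/* the caller is dying; stop the work early */
		freed = shrink_once();
	} while (freed);		/* the real exit threshold differs slightly */
}

int main(void)
{
	drop_slab_sketch();
	return 0;
}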