summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile6
-rw-r--r--mm/compaction.c5
-rw-r--r--mm/debug.c46
-rw-r--r--mm/filemap.c37
-rw-r--r--mm/gup.c115
-rw-r--r--mm/gup_benchmark.c40
-rw-r--r--mm/hmm.c12
-rw-r--r--mm/huge_memory.c49
-rw-r--r--mm/hugetlb.c96
-rw-r--r--mm/kasan/quarantine.c18
-rw-r--r--mm/kmemleak.c42
-rw-r--r--mm/maccess.c6
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memblock.c5
-rw-r--r--mm/memcontrol.c54
-rw-r--r--mm/memory.c403
-rw-r--r--mm/memory_hotplug.c146
-rw-r--r--mm/mempolicy.c35
-rw-r--r--mm/migrate.c106
-rw-r--r--mm/mmap.c98
-rw-r--r--mm/mmu_gather.c261
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/mremap.c50
-rw-r--r--mm/nommu.c6
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page-writeback.c33
-rw-r--r--mm/page_alloc.c374
-rw-r--r--mm/page_io.c8
-rw-r--r--mm/percpu.c1
-rw-r--r--mm/pgtable-generic.c1
-rw-r--r--mm/rmap.c42
-rw-r--r--mm/slab.c8
-rw-r--r--mm/slab_common.c115
-rw-r--r--mm/slub.c83
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c1
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c83
-rw-r--r--mm/util.c12
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c38
-rw-r--r--mm/vmstat.c14
-rw-r--r--mm/workingset.c135
-rw-r--r--mm/zsmalloc.c2
44 files changed, 1512 insertions, 1120 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 26ef77a3883b..6485d5745dd7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,9 +23,9 @@ KCOV_INSTRUMENT_vmstat.o := n
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
- mlock.o mmap.o mprotect.o mremap.o msync.o \
- page_vma_mapped.o pagewalk.o pgtable-generic.o \
- rmap.o vmalloc.o
+ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
+ msync.o page_vma_mapped.o pagewalk.o \
+ pgtable-generic.o rmap.o vmalloc.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/compaction.c b/mm/compaction.c
index faca45ebe62d..7c607479de4a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -22,6 +22,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
+#include <linux/psi.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -2068,11 +2069,15 @@ static int kcompactd(void *p)
pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
+ unsigned long pflags;
+
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
wait_event_freezable(pgdat->kcompactd_wait,
kcompactd_work_requested(pgdat));
+ psi_memstall_enter(&pflags);
kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
}
return 0;
diff --git a/mm/debug.c b/mm/debug.c
index bd10aad8539a..cdacba12e09a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -13,6 +13,7 @@
#include <trace/events/mmflags.h>
#include <linux/migrate.h>
#include <linux/page_owner.h>
+#include <linux/ctype.h>
#include "internal.h"
@@ -175,4 +176,49 @@ void dump_mm(const struct mm_struct *mm)
);
}
+static bool page_init_poisoning __read_mostly = true;
+
+static int __init setup_vm_debug(char *str)
+{
+ bool __page_init_poisoning = true;
+
+ /*
+ * Calling vm_debug with no arguments is equivalent to requesting
+ * to enable all debugging options we can control.
+ */
+ if (*str++ != '=' || !*str)
+ goto out;
+
+ __page_init_poisoning = false;
+ if (*str == '-')
+ goto out;
+
+ while (*str) {
+ switch (tolower(*str)) {
+ case'p':
+ __page_init_poisoning = true;
+ break;
+ default:
+ pr_err("vm_debug option '%c' unknown. skipped\n",
+ *str);
+ }
+
+ str++;
+ }
+out:
+ if (page_init_poisoning && !__page_init_poisoning)
+ pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
+
+ page_init_poisoning = __page_init_poisoning;
+
+ return 1;
+}
+__setup("vm_debug", setup_vm_debug);
+
+void page_init_poison(struct page *page, size_t size)
+{
+ if (page_init_poisoning)
+ memset(page, PAGE_POISON_PATTERN, size);
+}
+EXPORT_SYMBOL_GPL(page_init_poison);
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/filemap.c b/mm/filemap.c
index 6b36516bc31d..218d0b2ec82d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -36,6 +36,8 @@
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
+#include <linux/delayacct.h>
+#include <linux/psi.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -894,12 +896,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- if (!(gfp_mask & __GFP_WRITE) &&
- shadow && workingset_refault(shadow)) {
- SetPageActive(page);
- workingset_activation(page);
- } else
- ClearPageActive(page);
+ WARN_ON_ONCE(PageActive(page));
+ if (!(gfp_mask & __GFP_WRITE) && shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
}
return ret;
@@ -1055,8 +1054,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ unsigned long pflags;
int ret = 0;
+ if (bit_nr == PG_locked &&
+ !PageUptodate(page) && PageWorkingset(page)) {
+ if (!PageSwapBacked(page))
+ delayacct_thrashing_start();
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
init_wait(wait);
wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
@@ -1095,6 +1104,12 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
finish_wait(q, wait);
+ if (thrashing) {
+ if (!PageSwapBacked(page))
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+
/*
* A signal could leave PageWaiters set. Clearing it here if
* !waitqueue_active would be possible (by open-coding finish_wait),
@@ -2493,9 +2508,7 @@ no_cached_page:
* system is low on memory, or a problem occurs while trying
* to schedule I/O.
*/
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
+ return vmf_error(error);
page_not_uptodate:
/*
@@ -2644,9 +2657,9 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
#else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
- return -ENOSYS;
+ return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
@@ -2908,7 +2921,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
- pos + iov_iter_count(from)))
+ pos + write_len))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
diff --git a/mm/gup.c b/mm/gup.c
index 1abc8b4afff6..841d7ef53591 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -20,6 +20,11 @@
#include "internal.h"
+struct follow_page_context {
+ struct dev_pagemap *pgmap;
+ unsigned int page_mask;
+};
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -71,10 +76,10 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags)
+ unsigned long address, pmd_t *pmd, unsigned int flags,
+ struct dev_pagemap **pgmap)
{
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -116,8 +121,8 @@ retry:
* Only return device mapping pages in the FOLL_GET case since
* they are only valid while holding the pgmap reference.
*/
- pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
- if (pgmap)
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
+ if (*pgmap)
page = pte_page(pte);
else
goto no_page;
@@ -152,15 +157,8 @@ retry:
goto retry;
}
- if (flags & FOLL_GET) {
+ if (flags & FOLL_GET)
get_page(page);
-
- /* drop the pgmap reference now that we hold the page */
- if (pgmap) {
- put_dev_pagemap(pgmap);
- pgmap = NULL;
- }
- }
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -210,7 +208,8 @@ no_page:
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
pmd_t *pmd, pmdval;
spinlock_t *ptl;
@@ -258,13 +257,13 @@ retry:
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags);
+ page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(pmdval)))
- return follow_page_pte(vma, address, pmd, flags);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
@@ -284,7 +283,7 @@ retry_locked:
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
if (flags & FOLL_SPLIT) {
int ret;
@@ -307,18 +306,18 @@ retry_locked:
}
return ret ? ERR_PTR(ret) :
- follow_page_pte(vma, address, pmd, flags);
+ follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
+ ctx->page_mask = HPAGE_PMD_NR - 1;
return page;
}
-
static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned long address, p4d_t *p4dp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
pud_t *pud;
spinlock_t *ptl;
@@ -344,7 +343,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
}
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags);
+ page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
@@ -352,13 +351,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
- return follow_pmd_mask(vma, address, pud, flags, page_mask);
+ return follow_pmd_mask(vma, address, pud, flags, ctx);
}
-
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned long address, pgd_t *pgdp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
p4d_t *p4d;
struct page *page;
@@ -378,7 +377,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
return page;
return no_page_table(vma, flags);
}
- return follow_pud_mask(vma, address, p4d, flags, page_mask);
+ return follow_pud_mask(vma, address, p4d, flags, ctx);
}
/**
@@ -396,13 +395,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
*/
struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+ struct follow_page_context *ctx)
{
pgd_t *pgd;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
- *page_mask = 0;
+ ctx->page_mask = 0;
/* make this handle hugepd */
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
@@ -431,7 +430,19 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags);
}
- return follow_p4d_mask(vma, address, pgd, flags, page_mask);
+ return follow_p4d_mask(vma, address, pgd, flags, ctx);
+}
+
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
+{
+ struct follow_page_context ctx = { NULL };
+ struct page *page;
+
+ page = follow_page_mask(vma, address, foll_flags, &ctx);
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -659,9 +670,9 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
- long i = 0;
- unsigned int page_mask;
+ long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
+ struct follow_page_context ctx = { NULL };
if (!nr_pages)
return 0;
@@ -691,12 +702,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pages ? &pages[i] : NULL);
if (ret)
return i ? : ret;
- page_mask = 0;
+ ctx.page_mask = 0;
goto next_page;
}
- if (!vma || check_vma_flags(vma, gup_flags))
- return i ? : -EFAULT;
+ if (!vma || check_vma_flags(vma, gup_flags)) {
+ ret = -EFAULT;
+ goto out;
+ }
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
@@ -709,23 +722,26 @@ retry:
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -ERESTARTSYS;
+ goto out;
+ }
cond_resched();
- page = follow_page_mask(vma, start, foll_flags, &page_mask);
+
+ page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking);
switch (ret) {
case 0:
goto retry;
+ case -EBUSY:
+ ret = 0;
+ /* FALLTHRU */
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
- return i ? i : ret;
- case -EBUSY:
- return i;
+ goto out;
case -ENOENT:
goto next_page;
}
@@ -737,27 +753,31 @@ retry:
*/
goto next_page;
} else if (IS_ERR(page)) {
- return i ? i : PTR_ERR(page);
+ ret = PTR_ERR(page);
+ goto out;
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
- page_mask = 0;
+ ctx.page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
- page_mask = 0;
+ ctx.page_mask = 0;
}
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
- return i;
+out:
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return i ? i : ret;
}
static bool vma_permits_fault(struct vm_area_struct *vma,
@@ -1780,12 +1800,11 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- unsigned long addr, len, end;
+ unsigned long len, end;
unsigned long flags;
int nr = 0;
start &= PAGE_MASK;
- addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
@@ -1807,7 +1826,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages, write)) {
local_irq_save(flags);
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(start, end, write, pages, &nr);
local_irq_restore(flags);
}
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 6a473709e9b6..debf11388a60 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,20 +6,25 @@
#include <linux/debugfs.h>
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
struct gup_benchmark {
- __u64 delta_usec;
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
__u64 addr;
__u64 size;
__u32 nr_pages_per_call;
__u32 flags;
+ __u64 expansion[10]; /* For future use */
};
static int __gup_benchmark_ioctl(unsigned int cmd,
struct gup_benchmark *gup)
{
ktime_t start_time, end_time;
- unsigned long i, nr, nr_pages, addr, next;
+ unsigned long i, nr_pages, addr, next;
+ int nr;
struct page **pages;
nr_pages = gup->size / PAGE_SIZE;
@@ -40,21 +45,40 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
- nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i);
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ nr = get_user_pages_fast(addr, nr, gup->flags & 1,
+ pages + i);
+ break;
+ case GUP_LONGTERM_BENCHMARK:
+ nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
+ pages + i, NULL);
+ break;
+ case GUP_BENCHMARK:
+ nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
+ NULL);
+ break;
+ default:
+ return -1;
+ }
+
if (nr <= 0)
break;
i += nr;
}
end_time = ktime_get();
- gup->delta_usec = ktime_us_delta(end_time, start_time);
+ gup->get_delta_usec = ktime_us_delta(end_time, start_time);
gup->size = addr - gup->addr;
+ start_time = ktime_get();
for (i = 0; i < nr_pages; i++) {
if (!pages[i])
break;
put_page(pages[i]);
}
+ end_time = ktime_get();
+ gup->put_delta_usec = ktime_us_delta(end_time, start_time);
kvfree(pages);
return 0;
@@ -66,8 +90,14 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
struct gup_benchmark gup;
int ret;
- if (cmd != GUP_FAST_BENCHMARK)
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_LONGTERM_BENCHMARK:
+ case GUP_BENCHMARK:
+ break;
+ default:
return -EINVAL;
+ }
if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
return -EFAULT;
diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49f7a0c..774d684fa2b4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1024,7 +1024,6 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
resource_size_t key, align_start, align_size, align_end;
struct device *device = devmem->device;
int ret, nid, is_ram;
- unsigned long pfn;
align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
align_size = ALIGN(devmem->resource->start +
@@ -1109,11 +1108,14 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
align_size >> PAGE_SHIFT, NULL);
mem_hotplug_done();
- for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
- struct page *page = pfn_to_page(pfn);
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, &devmem->pagemap);
- page->pgmap = &devmem->pagemap;
- }
return 0;
error_add_memory:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9eb79c384616..4e4ef8fa479d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -852,11 +852,10 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -886,12 +885,11 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
@@ -1000,11 +998,10 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pud_lockptr(mm, pud));
@@ -1028,12 +1025,11 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
@@ -1562,8 +1558,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
* We are not sure a pending tlb flush here is for a huge page
* mapping or not. Hence use the tlb range variant
*/
- if (mm_tlb_flush_pending(vma->vm_mm))
+ if (mm_tlb_flush_pending(vma->vm_mm)) {
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * change_huge_pmd() released the pmd lock before
+ * invalidating the secondary MMUs sharing the primary
+ * MMU pagetables (with ->invalidate_range()). The
+ * mmu_notifier_invalidate_range_end() (which
+ * internally calls ->invalidate_range()) in
+ * change_pmd_range() will run after us, so we can't
+ * rely on it here and we need an explicit invalidate.
+ */
+ mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ }
/*
* Migrate the THP to the requested node, returns with page unlocked
@@ -1780,7 +1788,7 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
@@ -1811,7 +1819,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
- if (pmd_present(pmd) && pmd_dirty(pmd))
+ if (pmd_present(pmd))
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
@@ -1822,12 +1830,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
}
pmd = move_soft_dirty_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
- if (new_ptl != old_ptl)
- spin_unlock(new_ptl);
if (force_flush)
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
- else
- *need_flush = true;
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
spin_unlock(old_ptl);
return true;
}
@@ -2371,6 +2377,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
+ (1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
(1L << PG_dirty)));
@@ -2882,9 +2889,6 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
if (!(pvmw->pmd && !pvmw->pte))
return;
- mmu_notifier_invalidate_range_start(mm, address,
- address + HPAGE_PMD_SIZE);
-
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = *pvmw->pmd;
pmdp_invalidate(vma, address, pvmw->pmd);
@@ -2897,9 +2901,6 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
page_remove_rmap(page, true);
put_page(page);
-
- mmu_notifier_invalidate_range_end(mm, address,
- address + HPAGE_PMD_SIZE);
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -2928,7 +2929,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
else
page_add_file_rmap(new, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
- if (vma->vm_flags & VM_LOCKED)
+ if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
mlock_vma_page(new);
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..7b5c0ad9a6bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- const unsigned long mmun_start = start; /* For mmu_notifiers */
- const unsigned long mmun_end = end; /* For mmu_notifiers */
+ unsigned long mmun_start = start; /* For mmu_notifiers */
+ unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
+
+ /*
+ * If sharing possible, alert mmu notifiers of worst case.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
for (; address < end; address += sz) {
@@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep)) {
spin_unlock(ptl);
+ /*
+ * We just unmapped a page of PMDs by clearing a PUD.
+ * The caller's TLB flush range should cover this area.
+ */
continue;
}
@@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
{
struct mm_struct *mm;
struct mmu_gather tlb;
+ unsigned long tlb_start = start;
+ unsigned long tlb_end = end;
+
+ /*
+ * If shared PMDs were possibly used within this vma range, adjust
+ * start/end for worst case tlb flushing.
+ * Note that we can not be sure if PMDs are shared until we try to
+ * unmap pages. However, we want to make sure TLB flushing covers
+ * the largest possible range.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
mm = vma->vm_mm;
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb, tlb_start, tlb_end);
}
/*
@@ -3670,6 +3690,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return err;
ClearPagePrivate(page);
+ /*
+ * set page dirty so that it will not be removed from cache/file
+ * by non-hugetlbfs specific code paths.
+ */
+ set_page_dirty(page);
+
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
@@ -4298,11 +4324,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
+ unsigned long f_start = start;
+ unsigned long f_end = end;
+ bool shared_pmd = false;
+
+ /*
+ * In the case of shared PMDs, the area to flush could be beyond
+ * start/end. Set f_start/f_end to cover the maximum possible
+ * range if PMD sharing is possible.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
BUG_ON(address >= end);
- flush_cache_range(vma, address, end);
+ flush_cache_range(vma, f_start, f_end);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, f_start, f_end);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -4313,6 +4349,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
spin_unlock(ptl);
+ shared_pmd = true;
continue;
}
pte = huge_ptep_get(ptep);
@@ -4348,9 +4385,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
* once we release i_mmap_rwsem, another task can do the final put_page
- * and that page table be reused and filled with junk.
+ * and that page table be reused and filled with junk. If we actually
+ * did unshare a page of pmds, flush the range corresponding to the pud.
*/
- flush_hugetlb_tlb_range(vma, start, end);
+ if (shared_pmd)
+ flush_hugetlb_tlb_range(vma, f_start, f_end);
+ else
+ flush_hugetlb_tlb_range(vma, start, end);
/*
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
@@ -4358,7 +4399,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
- mmu_notifier_invalidate_range_end(mm, start, end);
+ mmu_notifier_invalidate_range_end(mm, f_start, f_end);
return pages << h->order;
}
@@ -4545,13 +4586,41 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
/*
* check on proper vm_flags and page table alignment
*/
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
+ if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ unsigned long check_addr = *start;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+ unsigned long a_start = check_addr & PUD_MASK;
+ unsigned long a_end = a_start + PUD_SIZE;
+
+ /*
+ * If sharing is possible, adjust start/end if necessary.
+ */
+ if (range_in_vma(vma, a_start, a_end)) {
+ if (a_start < *start)
+ *start = a_start;
+ if (a_end > *end)
+ *end = a_end;
+ }
+ }
+}
+
+/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
@@ -4648,6 +4717,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 3a8ddf8baf7d..b209dbaefde8 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -103,7 +103,7 @@ static int quarantine_head;
static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
-static DEFINE_SPINLOCK(quarantine_lock);
+static DEFINE_RAW_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
@@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- spin_lock(&quarantine_lock);
+ raw_spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
@@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
- spin_unlock(&quarantine_lock);
+ raw_spin_unlock(&quarantine_lock);
}
local_irq_restore(flags);
@@ -230,7 +230,7 @@ void quarantine_reduce(void)
* expected case).
*/
srcu_idx = srcu_read_lock(&remove_cache_srcu);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
@@ -254,7 +254,7 @@ void quarantine_reduce(void)
quarantine_head = 0;
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
srcu_read_unlock(&remove_cache_srcu, srcu_idx);
@@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem_cache *cache)
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
/* Scanning whole quarantine can take a while. */
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
cond_resched();
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 17dd883198ae..4f7e4b5a2f08 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -86,6 +86,7 @@
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
@@ -181,6 +182,7 @@ struct kmemleak_object {
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
/* number of bytes to print at a time (1, 2, 4, 8) */
@@ -235,6 +237,9 @@ static int kmemleak_skip_disable;
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
+static bool kmemleak_verbose;
+module_param_named(verbose, kmemleak_verbose, bool, 0600);
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
* kernel allocator. However, both the kernel allocator and kmemleak may
@@ -299,6 +304,25 @@ static void kmemleak_disable(void);
kmemleak_disable(); \
} while (0)
+#define warn_or_seq_printf(seq, fmt, ...) do { \
+ if (seq) \
+ seq_printf(seq, fmt, ##__VA_ARGS__); \
+ else \
+ pr_warn(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
+ int rowsize, int groupsize, const void *buf,
+ size_t len, bool ascii)
+{
+ if (seq)
+ seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
+ buf, len, ascii);
+ else
+ print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
+ rowsize, groupsize, buf, len, ascii);
+}
+
/*
* Printing of the objects hex dump to the seq file. The number of lines to be
* printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
@@ -314,10 +338,10 @@ static void hex_dump_object(struct seq_file *seq,
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
- seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
- seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
kasan_enable_current();
}
@@ -365,17 +389,17 @@ static void print_unreferenced(struct seq_file *seq,
int i;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
- seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+ warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
object->pointer, object->size);
- seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
object->comm, object->pid, object->jiffies,
msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
- seq_printf(seq, " backtrace:\n");
+ warn_or_seq_printf(seq, " backtrace:\n");
for (i = 0; i < object->trace_len; i++) {
void *ptr = (void *)object->trace[i];
- seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
}
}
@@ -1598,6 +1622,10 @@ static void kmemleak_scan(void)
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
+
+ if (kmemleak_verbose)
+ print_unreferenced(NULL, object);
+
new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
diff --git a/mm/maccess.c b/mm/maccess.c
index ec00be51a24f..f3416632e5a4 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -30,8 +30,10 @@ long __probe_kernel_read(void *dst, const void *src, size_t size)
set_fs(KERNEL_DS);
pagefault_disable();
+ current->kernel_uaccess_faults_ok++;
ret = __copy_from_user_inatomic(dst,
(__force const void __user *)src, size);
+ current->kernel_uaccess_faults_ok--;
pagefault_enable();
set_fs(old_fs);
@@ -58,7 +60,9 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
set_fs(KERNEL_DS);
pagefault_disable();
+ current->kernel_uaccess_faults_ok++;
ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ current->kernel_uaccess_faults_ok--;
pagefault_enable();
set_fs(old_fs);
@@ -94,11 +98,13 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
set_fs(KERNEL_DS);
pagefault_disable();
+ current->kernel_uaccess_faults_ok++;
do {
ret = __get_user(*dst++, (const char __user __force *)src++);
} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
+ current->kernel_uaccess_faults_ok--;
dst[-1] = '\0';
pagefault_enable();
set_fs(old_fs);
diff --git a/mm/madvise.c b/mm/madvise.c
index 9d802566c494..6cb1ca93e290 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -96,7 +96,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if (new_flags & VM_SPECIAL) {
+ if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 237944479d25..a85315083b5a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1444,10 +1444,9 @@ void * __init memblock_virt_alloc_try_nid_raw(
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
-#ifdef CONFIG_DEBUG_VM
if (ptr && size > 0)
- memset(ptr, PAGE_POISON_PATTERN, size);
-#endif
+ page_init_poison(ptr, size);
+
return ptr;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29d9d1a69b36..54920cbc46bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1669,6 +1669,8 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
+ memcg_memory_event(memcg, MEMCG_OOM);
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
@@ -2250,8 +2252,6 @@ retry:
if (fatal_signal_pending(current))
goto force;
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
-
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
@@ -2460,7 +2460,7 @@ static void memcg_kmem_cache_create_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;
@@ -2478,25 +2478,6 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
queue_work(memcg_kmem_cache_wq, &cw->work);
}
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- /*
- * We need to stop accounting when we kmalloc, because if the
- * corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_kmem_cache_create will recurse.
- *
- * However, it is better to enclose the whole function. Depending on
- * the debugging options enabled, INIT_WORK(), for instance, can
- * trigger an allocation. This too, will make us recurse. Because at
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
- * the safest choice is to do it like this, wrapping the whole function.
- */
- current->memcg_kmem_skip_account = 1;
- __memcg_schedule_kmem_cache_create(memcg, cachep);
- current->memcg_kmem_skip_account = 0;
-}
-
static inline bool memcg_kmem_bypass(void)
{
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
@@ -2531,9 +2512,6 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (memcg_kmem_bypass())
return cachep;
- if (current->memcg_kmem_skip_account)
- return cachep;
-
memcg = get_mem_cgroup_from_current();
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
@@ -4321,14 +4299,12 @@ static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
- atomic_add(n, &memcg->id.ref);
+ refcount_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
@@ -4545,7 +4521,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
}
/* Online state pins memcg ID, memcg ID pins CSS */
- atomic_set(&memcg->id.ref, 1);
+ refcount_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
@@ -4573,6 +4549,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
@@ -5595,6 +5573,13 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+ seq_printf(m, "workingset_refault %lu\n",
+ acc.stat[WORKINGSET_REFAULT]);
+ seq_printf(m, "workingset_activate %lu\n",
+ acc.stat[WORKINGSET_ACTIVATE]);
+ seq_printf(m, "workingset_nodereclaim %lu\n",
+ acc.stat[WORKINGSET_NODERECLAIM]);
+
seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
acc.events[PGSCAN_DIRECT]);
@@ -5605,13 +5590,6 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
- seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
- seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
- seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
return 0;
}
@@ -6377,7 +6355,7 @@ subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
diff --git a/mm/memory.c b/mm/memory.c
index c467102a5cbc..072139579d89 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -186,253 +186,6 @@ static void check_sync_rss_stat(struct task_struct *task)
#endif /* SPLIT_RSS_COUNTING */
-#ifdef HAVE_GENERIC_MMU_GATHER
-
-static bool tlb_next_batch(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
-
- batch = tlb->active;
- if (batch->next) {
- tlb->active = batch->next;
- return true;
- }
-
- if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
- return false;
-
- batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
- if (!batch)
- return false;
-
- tlb->batch_count++;
- batch->next = NULL;
- batch->nr = 0;
- batch->max = MAX_GATHER_BATCH;
-
- tlb->active->next = batch;
- tlb->active = batch;
-
- return true;
-}
-
-void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- tlb->mm = mm;
-
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
- tlb->need_flush_all = 0;
- tlb->local.next = NULL;
- tlb->local.nr = 0;
- tlb->local.max = ARRAY_SIZE(tlb->__pages);
- tlb->active = &tlb->local;
- tlb->batch_count = 0;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
- tlb->page_size = 0;
-
- __tlb_reset_range(tlb);
-}
-
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb_table_flush(tlb);
-#endif
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- free_pages_and_swap_cache(batch->pages, batch->nr);
- batch->nr = 0;
- }
- tlb->active = &tlb->local;
-}
-
-void tlb_flush_mmu(struct mmu_gather *tlb)
-{
- tlb_flush_mmu_tlbonly(tlb);
- tlb_flush_mmu_free(tlb);
-}
-
-/* tlb_finish_mmu
- * Called at the end of the shootdown operation to free up any resources
- * that were required.
- */
-void arch_tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end, bool force)
-{
- struct mmu_gather_batch *batch, *next;
-
- if (force)
- __tlb_adjust_range(tlb, start, end - start);
-
- tlb_flush_mmu(tlb);
-
- /* keep the page table cache within bounds */
- check_pgt_cache();
-
- for (batch = tlb->local.next; batch; batch = next) {
- next = batch->next;
- free_pages((unsigned long)batch, 0);
- }
- tlb->local.next = NULL;
-}
-
-/* __tlb_remove_page
- * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
- * handling the additional races in SMP caused by other CPUs caching valid
- * mappings in their TLBs. Returns the number of free page slots left.
- * When out of page slots we must call tlb_flush_mmu().
- *returns true if the caller should flush.
- */
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
-{
- struct mmu_gather_batch *batch;
-
- VM_BUG_ON(!tlb->end);
- VM_WARN_ON(tlb->page_size != page_size);
-
- batch = tlb->active;
- /*
- * Add the page and check if we are full. If so
- * force a flush.
- */
- batch->pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
- if (!tlb_next_batch(tlb))
- return true;
- batch = tlb->active;
- }
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
-
- return false;
-}
-
-#endif /* HAVE_GENERIC_MMU_GATHER */
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-
-/*
- * See the comment near struct mmu_table_batch.
- */
-
-/*
- * If we want tlb_remove_table() to imply TLB invalidates.
- */
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
-{
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
-}
-
-static void tlb_remove_table_smp_sync(void *arg)
-{
- /* Simply deliver the interrupt */
-}
-
-static void tlb_remove_table_one(void *table)
-{
- /*
- * This isn't an RCU grace period and hence the page-tables cannot be
- * assumed to be actually RCU-freed.
- *
- * It is however sufficient for software page-table walkers that rely on
- * IRQ disabling. See the comment near struct mmu_table_batch.
- */
- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
-}
-
-static void tlb_remove_table_rcu(struct rcu_head *head)
-{
- struct mmu_table_batch *batch;
- int i;
-
- batch = container_of(head, struct mmu_table_batch, rcu);
-
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
-
- free_page((unsigned long)batch);
-}
-
-void tlb_table_flush(struct mmu_gather *tlb)
-{
- struct mmu_table_batch **batch = &tlb->batch;
-
- if (*batch) {
- tlb_table_invalidate(tlb);
- call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
- *batch = NULL;
- }
-}
-
-void tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct mmu_table_batch **batch = &tlb->batch;
-
- if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
- if (*batch == NULL) {
- tlb_table_invalidate(tlb);
- tlb_remove_table_one(table);
- return;
- }
- (*batch)->nr = 0;
- }
-
- (*batch)->tables[(*batch)->nr++] = table;
- if ((*batch)->nr == MAX_TABLE_BATCH)
- tlb_table_flush(tlb);
-}
-
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-
-/**
- * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
- * @tlb: the mmu_gather structure to initialize
- * @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
- *
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- arch_tlb_gather_mmu(tlb, mm, start, end);
- inc_tlb_flush_pending(tlb->mm);
-}
-
-void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
-{
- /*
- * If there are parallel threads are doing PTE changes on same range
- * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
- * flush by batching, a thread has stable TLB entry can fail to flush
- * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
- * forcefully if we detect parallel PTE batching threads.
- */
- bool force = mm_tlb_flush_nested(tlb->mm);
-
- arch_tlb_finish_mmu(tlb, start, end, force);
- dec_tlb_flush_pending(tlb->mm);
-}
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1767,19 +1520,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
- int retval;
pte_t *pte, entry;
spinlock_t *ptl;
- retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out;
- retval = -EBUSY;
+ return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
@@ -1812,56 +1562,32 @@ out_mkwrite:
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
- retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
-out:
- return retval;
-}
-
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_insert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- *
- * vma cannot be a COW mapping.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
-{
- return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+ return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL(vm_insert_pfn);
/**
- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers to
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
- * cow mappings. In general, using multiple vmas is preferable;
- * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * COW mappings. In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
*/
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
- int ret;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1875,19 +1601,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
+}
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
- return ret;
+/**
+ * vmf_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return the result of this function.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
-EXPORT_SYMBOL(vm_insert_pfn_prot);
+EXPORT_SYMBOL(vmf_insert_pfn);
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
@@ -1903,20 +1654,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
return false;
}
-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, bool mkwrite)
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
+ int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -1935,36 +1687,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- return insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot);
+ } else {
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
}
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
-
}
-EXPORT_SYMBOL(vm_insert_mixed);
+EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of PTE failed because someone else already added a
* different entry in the mean time, we treat that as success as we assume
* the same entry was actually inserted.
*/
-
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- int err;
-
- err = __vm_insert_mixed(vma, addr, pfn, true);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_NOPAGE;
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -3745,10 +3496,36 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
- /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
- if (!vma->vm_ops->fault)
- ret = VM_FAULT_SIGBUS;
- else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ /*
+ * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+ */
+ if (!vma->vm_ops->fault) {
+ /*
+ * If we find a migration pmd entry or a none pmd entry, which
+ * should never happen, return SIGBUS
+ */
+ if (unlikely(!pmd_present(*vmf->pmd)))
+ ret = VM_FAULT_SIGBUS;
+ else {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+ vmf->pmd,
+ vmf->address,
+ &vmf->ptl);
+ /*
+ * Make sure this is not a temporary clearing of pte
+ * by holding ptl and checking again. A R/M/W update
+ * of pte involves: take ptl, clearing the pte so that
+ * we don't have concurrent modification by hardware
+ * followed by an update.
+ */
+ if (unlikely(pte_none(*vmf->pte)))
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = VM_FAULT_NOPAGE;
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
+ } else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);
else if (!(vma->vm_flags & VM_SHARED))
ret = do_cow_fault(vmf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 38d94b703e9d..7e6509a53d79 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -687,62 +687,19 @@ static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
{
int nid = zone_to_nid(zone);
- enum zone_type zone_last = ZONE_NORMAL;
- /*
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_NORMAL,
- * set zone_last to ZONE_NORMAL.
- *
- * If we don't have HIGHMEM nor movable node,
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
- */
- if (N_MEMORY == N_NORMAL_MEMORY)
- zone_last = ZONE_MOVABLE;
+ arg->status_change_nid = -1;
+ arg->status_change_nid_normal = -1;
+ arg->status_change_nid_high = -1;
- /*
- * if the memory to be online is in a zone of 0...zone_last, and
- * the zones of 0...zone_last don't have memory before online, we will
- * need to set the node to node_states[N_NORMAL_MEMORY] after
- * the memory is online.
- */
- if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+ if (!node_state(nid, N_MEMORY))
+ arg->status_change_nid = nid;
+ if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
- else
- arg->status_change_nid_normal = -1;
-
#ifdef CONFIG_HIGHMEM
- /*
- * If we have movable node, node_states[N_HIGH_MEMORY]
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
- * set zone_last to ZONE_HIGHMEM.
- *
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_MOVABLE,
- * set zone_last to ZONE_MOVABLE.
- */
- zone_last = ZONE_HIGHMEM;
- if (N_MEMORY == N_HIGH_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
+ if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
- else
- arg->status_change_nid_high = -1;
-#else
- arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
-
- /*
- * if the node don't have memory befor online, we will need to
- * set the node to node_states[N_MEMORY] after the memory
- * is online.
- */
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
- else
- arg->status_change_nid = -1;
}
static void node_states_set_node(int node, struct memory_notify *arg)
@@ -753,7 +710,8 @@ static void node_states_set_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_high >= 0)
node_set_state(node, N_HIGH_MEMORY);
- node_set_state(node, N_MEMORY);
+ if (arg->status_change_nid >= 0)
+ node_set_state(node, N_MEMORY);
}
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
@@ -1505,75 +1463,53 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long present_pages = 0;
- enum zone_type zt, zone_last = ZONE_NORMAL;
+ enum zone_type zt;
- /*
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_NORMAL,
- * set zone_last to ZONE_NORMAL.
- *
- * If we don't have HIGHMEM nor movable node,
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
- */
- if (N_MEMORY == N_NORMAL_MEMORY)
- zone_last = ZONE_MOVABLE;
+ arg->status_change_nid = -1;
+ arg->status_change_nid_normal = -1;
+ arg->status_change_nid_high = -1;
/*
- * check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is in a zone of 0...zone_last,
- * and it is the last present memory, 0...zone_last will
- * become empty after offline , thus we can determind we will
- * need to clear the node from node_states[N_NORMAL_MEMORY].
+ * Check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offline is within the range
+ * [0..ZONE_NORMAL], and it is the last present memory there,
+ * the zones in that range will become empty after the offlining,
+ * thus we can determine that we need to clear the node from
+ * node_states[N_NORMAL_MEMORY].
*/
- for (zt = 0; zt <= zone_last; zt++)
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
- else
- arg->status_change_nid_normal = -1;
#ifdef CONFIG_HIGHMEM
/*
- * If we have movable node, node_states[N_HIGH_MEMORY]
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
- * set zone_last to ZONE_HIGHMEM.
- *
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_MOVABLE,
- * set zone_last to ZONE_MOVABLE.
+ * node_states[N_HIGH_MEMORY] contains nodes which
+ * have normal memory or high memory.
+ * Here we add the present_pages belonging to ZONE_HIGHMEM.
+ * If the zone is within the range of [0..ZONE_HIGHMEM), and
+ * we determine that the zones in that range become empty,
+ * we need to clear the node for N_HIGH_MEMORY.
*/
- zone_last = ZONE_HIGHMEM;
- if (N_MEMORY == N_HIGH_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- for (; zt <= zone_last; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+ if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
arg->status_change_nid_high = zone_to_nid(zone);
- else
- arg->status_change_nid_high = -1;
-#else
- arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
/*
- * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ * We have accounted the pages from [0..ZONE_NORMAL), and
+ * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
+ * as well.
+ * Here we count the possible pages from ZONE_MOVABLE.
+ * If after having accounted all the pages, we see that the nr_pages
+ * to be offlined is over or equal to the accounted pages,
+ * we know that the node will become empty, and so, we can clear
+ * it for N_MEMORY as well.
*/
- zone_last = ZONE_MOVABLE;
+ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
- /*
- * check whether node_states[N_HIGH_MEMORY] will be changed
- * If we try to offline the last present @nr_pages from the node,
- * we can determind we will need to clear the node from
- * node_states[N_HIGH_MEMORY].
- */
- for (; zt <= zone_last; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
if (nr_pages >= present_pages)
arg->status_change_nid = zone_to_nid(zone);
- else
- arg->status_change_nid = -1;
}
static void node_states_clear_node(int node, struct memory_notify *arg)
@@ -1581,12 +1517,10 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
- if ((N_MEMORY != N_NORMAL_MEMORY) &&
- (arg->status_change_nid_high >= 0))
+ if (arg->status_change_nid_high >= 0)
node_clear_state(node, N_HIGH_MEMORY);
- if ((N_MEMORY != N_HIGH_MEMORY) &&
- (arg->status_change_nid >= 0))
+ if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index da858f794eb6..cfd26d7e61a1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -797,16 +797,19 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
+ int locked = 1;
+ err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
}
+ if (locked)
+ up_read(&mm->mmap_sem);
return err;
}
@@ -817,7 +820,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -857,7 +860,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(addr);
+ /*
+ * Take a refcount on the mpol, lookup_node()
+ * wil drop the mmap_sem, so after calling
+ * lookup_node() only "pol" remains valid, "vma"
+ * is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
@@ -892,7 +904,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
out:
mpol_cond_put(pol);
if (vma)
- up_read(&current->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
return err;
}
@@ -2697,12 +2711,11 @@ static const char * const policy_modes[] =
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
- unsigned short mode;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int err = 1;
+ int err = 1, mode;
if (nodelist) {
/* NUL-terminate mode or flags string */
@@ -2717,12 +2730,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode < MPOL_MAX; mode++) {
- if (!strcmp(str, policy_modes[mode])) {
- break;
- }
- }
- if (mode >= MPOL_MAX)
+ mode = match_string(policy_modes, MPOL_MAX, str);
+ if (mode < 0)
goto out;
switch (mode) {
diff --git a/mm/migrate.c b/mm/migrate.c
index b3cde3fd094a..f7e4bfdc13b7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -275,6 +275,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
+ if (PageTransHuge(page) && PageMlocked(page))
+ clear_page_mlock(page);
+
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
}
@@ -670,6 +673,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
+ if (PageWorkingset(page))
+ SetPageWorkingset(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
@@ -1399,7 +1404,7 @@ retry:
* we encounter them after the rest of the list
* is processed.
*/
- if (PageTransHuge(page)) {
+ if (PageTransHuge(page) && !PageHuge(page)) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);
@@ -1843,46 +1848,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
return newpage;
}
-/*
- * page migration rate limiting control.
- * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
- * window of time. Default here says do not migrate more than 1280M per second.
- */
-static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-
-/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
- unsigned long nr_pages)
-{
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
- spin_lock(&pgdat->numabalancing_migrate_lock);
- pgdat->numabalancing_migrate_nr_pages = 0;
- pgdat->numabalancing_migrate_next_window = jiffies +
- msecs_to_jiffies(migrate_interval_millisecs);
- spin_unlock(&pgdat->numabalancing_migrate_lock);
- }
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
- trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
- nr_pages);
- return true;
- }
-
- /*
- * This is an unlocked non-atomic update so errors are possible.
- * The consequences are failing to migrate when we potentiall should
- * have which is not severe enough to warrant locking. If it is ever
- * a problem, it can be converted to a per-cpu counter.
- */
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- return false;
-}
-
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
@@ -1955,14 +1920,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (page_is_file_cache(page) && PageDirty(page))
goto out;
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (numamigrate_update_ratelimit(pgdat, 1))
- goto out;
-
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated)
goto out;
@@ -2006,16 +1963,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int isolated = 0;
struct page *new_page = NULL;
int page_lru = page_is_file_cache(page);
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
-
- /*
- * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and
- * all the time is being spent migrating!
- */
- if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
- goto out_dropref;
+ unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
@@ -2038,15 +1986,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
new_page->index = page->index;
+ /* flush the cache before copying using the kernel virtual address */
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
migrate_page_copy(new_page, page);
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -2070,16 +2018,26 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
/*
- * Clear the old entry under pagetable lock and establish the new PTE.
- * Any parallel GUP will either observe the old page blocking on the
- * page lock, block on the page table lock or observe the new page.
- * The SetPageUptodate on the new page and page_add_new_anon_rmap
- * guarantee the copy is visible before the pagetable update.
+ * Overwrite the old entry under pagetable lock and establish
+ * the new PTE. Any parallel GUP will either observe the old
+ * page blocking on the page lock, block on the page table
+ * lock or observe the new page. The SetPageUptodate on the
+ * new page and page_add_new_anon_rmap guarantee the copy is
+ * visible before the pagetable update.
+ */
+ page_add_anon_rmap(new_page, vma, start, true);
+ /*
+ * At this point the pmd is numa/protnone (i.e. non present) and the TLB
+ * has already been flushed globally. So no TLB can be currently
+ * caching this non present pmd mapping. There's no need to clear the
+ * pmd before doing set_pmd_at(), nor to flush the TLB after
+ * set_pmd_at(). Clearing the pmd here would introduce a race
+ * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
+ * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
+ * pmd.
*/
- flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start, true);
- pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
page_ref_unfreeze(page, 2);
@@ -2088,11 +2046,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
/* Take an "isolate" reference and put new page on the LRU. */
get_page(new_page);
@@ -2113,11 +2066,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
spin_unlock(ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index 5f2b2b184c60..6c04292e16a7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -191,16 +191,19 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
- unsigned long newbrk, oldbrk;
+ unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
+ bool downgraded = false;
LIST_HEAD(uf);
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
+ origbrk = mm->brk;
+
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
@@ -229,14 +232,32 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
- if (oldbrk == newbrk)
- goto set_brk;
+ if (oldbrk == newbrk) {
+ mm->brk = brk;
+ goto success;
+ }
- /* Always allow shrinking brk. */
+ /*
+ * Always allow shrinking brk.
+ * __do_munmap() may downgrade mmap_sem to read.
+ */
if (brk <= mm->brk) {
- if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
- goto set_brk;
- goto out;
+ int ret;
+
+ /*
+ * mm->brk must to be protected by write mmap_sem so update it
+ * before downgrading mmap_sem. When __do_munmap() fails,
+ * mm->brk will be restored from origbrk.
+ */
+ mm->brk = brk;
+ ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
+ if (ret < 0) {
+ mm->brk = origbrk;
+ goto out;
+ } else if (ret == 1) {
+ downgraded = true;
+ }
+ goto success;
}
/* Check against existing mmap mappings. */
@@ -247,18 +268,21 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
goto out;
-
-set_brk:
mm->brk = brk;
+
+success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- up_write(&mm->mmap_sem);
+ if (downgraded)
+ up_read(&mm->mmap_sem);
+ else
+ up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
- retval = mm->brk;
+ retval = origbrk;
up_write(&mm->mmap_sem);
return retval;
}
@@ -1410,7 +1434,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (flags & MAP_FIXED_NOREPLACE) {
struct vm_area_struct *vma = find_vma(mm, addr);
- if (vma && vma->vm_start <= addr)
+ if (vma && vma->vm_start < addr + len)
return -EEXIST;
}
@@ -2687,8 +2711,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf)
+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf, bool downgrade)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
@@ -2770,25 +2794,38 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
+
tmp = tmp->vm_next;
}
}
- /*
- * Remove the vma's, and unmap the actual pages
- */
+ /* Detach vmas from rbtree */
detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
+ /*
+ * mpx unmap needs to be called with mmap_sem held for write.
+ * It is safe to call it before unmap_region().
+ */
arch_unmap(mm, vma, start, end);
+ if (downgrade)
+ downgrade_write(&mm->mmap_sem);
+
+ unmap_region(mm, vma, prev, start, end);
+
/* Fix up all other VM information */
remove_vma_list(mm, vma);
- return 0;
+ return downgrade ? 1 : 0;
}
-int vm_munmap(unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf)
+{
+ return __do_munmap(mm, start, len, uf, false);
+}
+
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
int ret;
struct mm_struct *mm = current->mm;
@@ -2797,17 +2834,32 @@ int vm_munmap(unsigned long start, size_t len)
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = do_munmap(mm, start, len, &uf);
- up_write(&mm->mmap_sem);
+ ret = __do_munmap(mm, start, len, &uf, downgrade);
+ /*
+ * Returning 1 indicates mmap_sem is downgraded.
+ * But 1 is not legal return value of vm_munmap() and munmap(), reset
+ * it to 0 before return.
+ */
+ if (ret == 1) {
+ up_read(&mm->mmap_sem);
+ ret = 0;
+ } else
+ up_write(&mm->mmap_sem);
+
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+int vm_munmap(unsigned long start, size_t len)
+{
+ return __vm_munmap(start, len, false);
+}
EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
profile_munmap(addr);
- return vm_munmap(addr, len);
+ return __vm_munmap(addr, len, true);
}
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
new file mode 100644
index 000000000000..2a9fbc4a37d5
--- /dev/null
+++ b/mm/mmu_gather.c
@@ -0,0 +1,261 @@
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/mmdebug.h>
+#include <linux/mm_types.h>
+#include <linux/pagemap.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/swap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#ifdef HAVE_GENERIC_MMU_GATHER
+
+static bool tlb_next_batch(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ batch = tlb->active;
+ if (batch->next) {
+ tlb->active = batch->next;
+ return true;
+ }
+
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return false;
+
+ batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ if (!batch)
+ return false;
+
+ tlb->batch_count++;
+ batch->next = NULL;
+ batch->nr = 0;
+ batch->max = MAX_GATHER_BATCH;
+
+ tlb->active->next = batch;
+ tlb->active = batch;
+
+ return true;
+}
+
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ tlb->mm = mm;
+
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
+ tlb->need_flush_all = 0;
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+ tlb->batch_count = 0;
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb->batch = NULL;
+#endif
+ tlb->page_size = 0;
+
+ __tlb_reset_range(tlb);
+}
+
+void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb_table_flush(tlb);
+#endif
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
+ free_pages_and_swap_cache(batch->pages, batch->nr);
+ batch->nr = 0;
+ }
+ tlb->active = &tlb->local;
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+}
+
+/* tlb_finish_mmu
+ * Called at the end of the shootdown operation to free up any resources
+ * that were required.
+ */
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
+{
+ struct mmu_gather_batch *batch, *next;
+
+ if (force) {
+ __tlb_reset_range(tlb);
+ __tlb_adjust_range(tlb, start, end - start);
+ }
+
+ tlb_flush_mmu(tlb);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+
+ for (batch = tlb->local.next; batch; batch = next) {
+ next = batch->next;
+ free_pages((unsigned long)batch, 0);
+ }
+ tlb->local.next = NULL;
+}
+
+/* __tlb_remove_page
+ * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
+ * handling the additional races in SMP caused by other CPUs caching valid
+ * mappings in their TLBs. Returns the number of free page slots left.
+ * When out of page slots we must call tlb_flush_mmu().
+ *returns true if the caller should flush.
+ */
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+{
+ struct mmu_gather_batch *batch;
+
+ VM_BUG_ON(!tlb->end);
+ VM_WARN_ON(tlb->page_size != page_size);
+
+ batch = tlb->active;
+ /*
+ * Add the page and check if we are full. If so
+ * force a flush.
+ */
+ batch->pages[batch->nr++] = page;
+ if (batch->nr == batch->max) {
+ if (!tlb_next_batch(tlb))
+ return true;
+ batch = tlb->active;
+ }
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+
+ return false;
+}
+
+#endif /* HAVE_GENERIC_MMU_GATHER */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then we still
+ * need to RCU-sched wait while freeing the pages because software
+ * walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely on
+ * IRQ disabling. See the comment near struct mmu_table_batch.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ __tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+ struct mmu_table_batch *batch;
+ int i;
+
+ batch = container_of(head, struct mmu_table_batch, rcu);
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch) {
+ tlb_table_invalidate(tlb);
+ call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
+ *batch = NULL;
+ }
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ tlb_table_invalidate(tlb);
+ tlb_remove_table_one(table);
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH)
+ tlb_table_flush(tlb);
+}
+
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ arch_tlb_gather_mmu(tlb, mm, start, end);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads are doing PTE changes on same range
+ * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
+ * flush by batching, a thread has stable TLB entry can fail to flush
+ * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
+ * forcefully if we detect parallel PTE batching threads.
+ */
+ bool force = mm_tlb_flush_nested(tlb->mm);
+
+ arch_tlb_finish_mmu(tlb, start, end, force);
+ dec_tlb_flush_pending(tlb->mm);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 82bb1a939c0e..5119ff846769 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -247,37 +247,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
-/*
- * Must be called while holding mm->mmap_sem for either read or write.
- * The result is guaranteed to be valid until mm->mmap_sem is dropped.
- */
-bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
-{
- struct mmu_notifier *mn;
- int id;
- bool ret = false;
-
- WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
-
- if (!mm_has_notifiers(mm))
- return ret;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (!mn->ops->invalidate_range &&
- !mn->ops->invalidate_range_start &&
- !mn->ops->invalidate_range_end)
- continue;
-
- if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = true;
- break;
- }
- }
- srcu_read_unlock(&srcu, id);
- return ret;
-}
-
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/mm/mremap.c b/mm/mremap.c
index 5c2e18505f75..7f9f9180e401 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -115,7 +115,7 @@ static pte_t move_soft_dirty_pte(pte_t pte)
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
+ unsigned long new_addr, bool need_rmap_locks)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
@@ -163,15 +163,17 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
- * If we are remapping a dirty PTE, make sure
+ * If we are remapping a valid PTE, make sure
* to flush TLB before we drop the PTL for the
- * old PTE or we may race with page_mkclean().
+ * PTE.
*
- * This check has to be done after we removed the
- * old PTE from page tables or another thread may
- * dirty it after the check and before the removal.
+ * NOTE! Both old and new PTL matter: the old one
+ * for racing with page_mkclean(), the new one to
+ * make sure the physical page stays valid until
+ * the TLB entry for the old mapping has been
+ * flushed.
*/
- if (pte_present(pte) && pte_dirty(pte))
+ if (pte_present(pte))
force_flush = true;
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
@@ -179,13 +181,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
}
arch_leave_lazy_mmu_mode();
+ if (force_flush)
+ flush_tlb_range(vma, old_end - len, old_end);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
- if (force_flush)
- flush_tlb_range(vma, old_end - len, old_end);
- else
- *need_flush = true;
pte_unmap_unlock(old_pte - 1, old_ptl);
if (need_rmap_locks)
drop_rmap_locks(vma);
@@ -198,7 +198,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
- bool need_flush = false;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -229,8 +228,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_huge_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd,
- &need_flush);
+ old_end, old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -246,10 +244,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (extent > next - new_addr)
extent = next - new_addr;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
- new_pmd, new_addr, need_rmap_locks, &need_flush);
+ new_pmd, new_addr, need_rmap_locks);
}
- if (need_flush)
- flush_tlb_range(vma, old_end-len, old_addr);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -525,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
@@ -561,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * do_munmap does all the needed commit accounting
+ * __do_munmap does all the needed commit accounting, and
+ * downgrades mmap_sem to read if so directed.
*/
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
- if (ret && old_len != new_len)
+ int retval;
+
+ retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (retval < 0 && old_len != new_len) {
+ ret = retval;
goto out;
+ /* Returning 1 indicates mmap_sem is downgraded to read. */
+ } else if (retval == 1)
+ downgraded = true;
ret = addr;
goto out;
}
@@ -631,7 +636,10 @@ out:
vm_unacct_memory(charged);
locked = 0;
}
- up_write(&current->mm->mmap_sem);
+ if (downgraded)
+ up_read(&current->mm->mmap_sem);
+ else
+ up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
userfaultfd_unmap_complete(mm, &uf_unmap_early);
diff --git a/mm/nommu.c b/mm/nommu.c
index e4aac33216ae..749276beb109 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1709,11 +1709,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return ret;
}
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
{
- *page_mask = 0;
return NULL;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f10aa5360616..6589f60d5018 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -859,7 +859,7 @@ static void __oom_kill_process(struct task_struct *victim)
* in order to prevent the OOM victim from depleting the memory
* reserves from the user space under its control.
*/
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
@@ -897,7 +897,7 @@ static void __oom_kill_process(struct task_struct *victim)
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
}
rcu_read_unlock();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fc6e5743b0bf..3f690bae6b78 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2140,6 +2140,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2153,7 +2160,6 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
xa_mark_t tag;
@@ -2161,23 +2167,17 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
@@ -2263,17 +2263,14 @@ continue_unlock:
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * back the index back to the start of the file for the next
+ * time we are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..863d46da6586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
+#include <linux/psi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -306,24 +307,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
}
/*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn static that contains the end of previous zone
+ * No need to protect because called very early in boot before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
+ }
+
/* Always populate low zones for address-constrained allocations */
- if (zone_end < pgdat_end_pfn(pgdat))
- return true;
- (*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- pgdat->first_deferred_pfn = pfn;
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
+ nr_initialised++;
+ if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
}
-
- return true;
+ return false;
}
#else
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -331,11 +341,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- return true;
+ return false;
}
#endif
@@ -1231,7 +1239,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
- SetPageReserved(page);
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
}
@@ -2015,10 +2028,6 @@ static int move_freepages(struct zone *zone,
pfn_valid(page_to_pfn(end_page)) &&
page_zone(start_page) != page_zone(end_page));
#endif
-
- if (num_movable)
- *num_movable = 0;
-
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -2058,6 +2067,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
+ if (num_movable)
+ *num_movable = 0;
+
start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
@@ -3366,26 +3378,12 @@ try_this_zone:
return NULL;
}
-/*
- * Large machines with many possible nodes should not always dump per-node
- * meminfo in irq context.
- */
-static inline bool should_suppress_show_mem(void)
-{
- bool ret = false;
-
-#if NODES_SHIFT > 8
- ret = in_interrupt();
-#endif
- return ret;
-}
-
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
+ if (!__ratelimit(&show_mem_rs))
return;
/*
@@ -3549,15 +3547,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
+ unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
return NULL;
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
@@ -3756,11 +3759,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
+ unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
+ psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
@@ -3772,6 +3777,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
current->reclaim_state = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
+ psi_memstall_leave(&pflags);
cond_resched();
@@ -3922,6 +3928,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
{
struct zone *zone;
struct zoneref *z;
+ bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +3992,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
}
}
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that
- * a particular WQ is congested if the worker thread is
- * looping without ever sleeping. Therefore we have to
- * do a short sleep here rather than calling
- * cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
- return true;
+ ret = true;
+ goto out;
}
}
- return false;
+out:
+ /*
+ * Memory allocation/reclaim might be called from a WQ context and the
+ * current implementation of the WQ concurrency control doesn't
+ * recognize that a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+ return ret;
}
static inline bool
@@ -4701,6 +4707,7 @@ long si_mem_available(void)
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
struct zone *zone;
int lru;
@@ -4726,19 +4733,13 @@ long si_mem_available(void)
available += pagecache;
/*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
- */
- available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
- wmark_low);
-
- /*
- * Part of the kernel memory, which can be released under memory
- * pressure.
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
*/
- available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
- PAGE_SHIFT;
+ reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
available = 0;
@@ -5449,6 +5450,30 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -5458,67 +5483,118 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
- unsigned long end_pfn = start_pfn + size;
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long pfn;
- unsigned long nr_initialised = 0;
+ unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- struct memblock_region *r = NULL, *tmp;
-#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
+#ifdef CONFIG_ZONE_DEVICE
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
- * memory
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
*/
- if (altmap && start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
-
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
- break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Check given memblock attribute by firmware which can affect
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
- * mirrored, it's an overlapped memmap init. skip it.
- */
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, tmp)
- if (pfn < memblock_region_memory_end_pfn(tmp))
- break;
- r = tmp;
- }
- if (pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- /* already initialized as NORMAL */
- pfn = memblock_region_memory_end_pfn(r);
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
continue;
- }
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, end_pfn))
+ break;
}
-#endif
-not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
- SetPageReserved(page);
+ __SetPageReserved(page);
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * bitmap is created for zone's valid pfn range. but memmap
+ * can be created for invalid pages (for alignment)
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ */
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+ }
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long size,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long zone_idx = zone_idx(zone);
+ unsigned long start = jiffies;
+ int nid = pgdat->node_id;
+
+ if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+ return;
+
+ /*
+ * The call to memmap_init_zone should have already taken care
+ * of the pages reserved for the memmap, so we can just jump to
+ * the end of that region and start processing the device pages.
+ */
+ if (pgmap->altmap_valid) {
+ struct vmem_altmap *altmap = &pgmap->altmap;
+
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ size = end_pfn - start_pfn;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer and hmm_data. It is a bug if a ZONE_DEVICE
+ * page is ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->hmm_data = 0;
/*
* Mark the block movable so that blocks are reserved for
@@ -5540,8 +5616,12 @@ not_early:
cond_resched();
}
}
+
+ pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+ size, jiffies_to_msecs(jiffies - start));
}
+#endif
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
@@ -5551,10 +5631,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
static int zone_batchsize(struct zone *zone)
{
@@ -6193,17 +6274,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
-#ifdef CONFIG_NUMA_BALANCING
-static void pgdat_init_numabalancing(struct pglist_data *pgdat)
-{
- spin_lock_init(&pgdat->numabalancing_migrate_lock);
- pgdat->numabalancing_migrate_nr_pages = 0;
- pgdat->numabalancing_migrate_next_window = jiffies;
-}
-#else
-static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
-#endif
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
@@ -6228,7 +6298,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
pgdat_resize_init(pgdat);
- pgdat_init_numabalancing(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
@@ -6440,45 +6509,65 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
}
#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
+
+/*
+ * Zero all valid struct pages in range [spfn, epfn), return number of struct
+ * pages zeroed
+ */
+static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ return pgcnt;
+}
+
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
+ *
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+ * layout is manually configured via memmap=.
*/
void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
- unsigned long pfn;
u64 i, pgcnt;
+ phys_addr_t next = 0;
/*
- * Loop through ranges that are reserved, but do not have reported
- * physical memory backing.
+ * Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_resv_unavail_range(i, &start, &end) {
- for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- mm_zero_struct_page(pfn_to_page(pfn));
- pgcnt++;
- }
+ for_each_mem_range(i, &memblock.memory, NULL,
+ NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (next < start)
+ pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ next = end;
}
+ pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
/*
* Struct pages that do not have backing memory. This could be because
* firmware is using some of this memory, or for some other reasons.
- * Once memblock is changed so such behaviour is not allowed: i.e.
- * list of "reserved" memory must be a subset of list of "memory", then
- * this code can be removed.
*/
if (pgcnt)
- pr_info("Reserved but unavailable: %lld pages", pgcnt);
+ pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
@@ -6815,15 +6904,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
{
enum zone_type zone_type;
- if (N_MEMORY == N_NORMAL_MEMORY)
- return;
-
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
- node_set_state(nid, N_HIGH_MEMORY);
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
- zone_type <= ZONE_NORMAL)
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index aafd19ec1db4..a451ffa9491c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
goto out;
}
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
- bio_associate_blkcg_from_page(bio, page);
+ bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
@@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/percpu.c b/mm/percpu.c
index a749d4d96e3e..4b90682623e9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1212,6 +1212,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
+ pcpu_mem_free(chunk->md_blocks);
pcpu_mem_free(chunk->bound_map);
pcpu_mem_free(chunk->alloc_map);
pcpu_mem_free(chunk);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cf2af04b34b9..532c29276fce 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -8,6 +8,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index eb477809a5c0..1e79fac3186b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
- * We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free in this function as call of try_to_unmap()
- * must hold a reference on the page.
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
*/
end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ }
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
+ if (PageHuge(page)) {
+ if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, start, end);
+ flush_tlb_range(vma, start, end);
+ mmu_notifier_invalidate_range(mm, start, end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&
diff --git a/mm/slab.c b/mm/slab.c
index aa76a70e087e..2a5654bb3b3f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1288,7 +1288,7 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
+ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
kmalloc_info[INDEX_NODE].name,
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
0, kmalloc_size(INDEX_NODE));
@@ -1304,7 +1304,7 @@ void __init kmem_cache_init(void)
for_each_online_node(nid) {
init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
- init_list(kmalloc_caches[INDEX_NODE],
+ init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
&init_kmem_cache_node[SIZE_NODE + nid], nid);
}
}
@@ -3675,6 +3675,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
struct kmem_cache *cachep;
void *ret;
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
@@ -3710,6 +3712,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
struct kmem_cache *cachep;
void *ret;
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return NULL;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fea3376f9816..7eb8dc136c1c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -973,14 +973,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
return s;
}
-struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+struct kmem_cache *
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);
-#ifdef CONFIG_ZONE_DMA
-struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
-EXPORT_SYMBOL(kmalloc_dma_caches);
-#endif
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -1027,25 +1023,20 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
unsigned int index;
- if (unlikely(size > KMALLOC_MAX_SIZE)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
if (size <= 192) {
if (!size)
return ZERO_SIZE_PTR;
index = size_index[size_index_elem(size)];
- } else
+ } else {
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ WARN_ON(1);
+ return NULL;
+ }
index = fls(size - 1);
+ }
-#ifdef CONFIG_ZONE_DMA
- if (unlikely((flags & GFP_DMA)))
- return kmalloc_dma_caches[index];
-
-#endif
- return kmalloc_caches[index];
+ return kmalloc_caches[kmalloc_type(flags)][index];
}
/*
@@ -1059,15 +1050,15 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
{"kmalloc-16", 16}, {"kmalloc-32", 32},
{"kmalloc-64", 64}, {"kmalloc-128", 128},
{"kmalloc-256", 256}, {"kmalloc-512", 512},
- {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
- {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
- {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
- {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
- {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
- {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
- {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
- {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
- {"kmalloc-67108864", 67108864}
+ {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
+ {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
+ {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
+ {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
+ {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
+ {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
+ {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
+ {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
+ {"kmalloc-64M", 67108864}
};
/*
@@ -1117,9 +1108,36 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
+static const char *
+kmalloc_cache_name(const char *prefix, unsigned int size)
+{
+
+ static const char units[3] = "\0kM";
+ int idx = 0;
+
+ while (size >= 1024 && (size % 1024 == 0)) {
+ size /= 1024;
+ idx++;
+ }
+
+ return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
+}
+
+static void __init
+new_kmalloc_cache(int idx, int type, slab_flags_t flags)
{
- kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
+ const char *name;
+
+ if (type == KMALLOC_RECLAIM) {
+ flags |= SLAB_RECLAIM_ACCOUNT;
+ name = kmalloc_cache_name("kmalloc-rcl",
+ kmalloc_info[idx].size);
+ BUG_ON(!name);
+ } else {
+ name = kmalloc_info[idx].name;
+ }
+
+ kmalloc_caches[type][idx] = create_kmalloc_cache(name,
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
}
@@ -1131,21 +1149,25 @@ static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
*/
void __init create_kmalloc_caches(slab_flags_t flags)
{
- int i;
+ int i, type;
- for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
- if (!kmalloc_caches[i])
- new_kmalloc_cache(i, flags);
+ for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+ if (!kmalloc_caches[type][i])
+ new_kmalloc_cache(i, type, flags);
- /*
- * Caches that are not of the two-to-the-power-of size.
- * These have to be created immediately after the
- * earlier power of two caches
- */
- if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
- new_kmalloc_cache(1, flags);
- if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
- new_kmalloc_cache(2, flags);
+ /*
+ * Caches that are not of the two-to-the-power-of size.
+ * These have to be created immediately after the
+ * earlier power of two caches
+ */
+ if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
+ !kmalloc_caches[type][1])
+ new_kmalloc_cache(1, type, flags);
+ if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
+ !kmalloc_caches[type][2])
+ new_kmalloc_cache(2, type, flags);
+ }
}
/* Kmalloc array is now usable */
@@ -1153,16 +1175,15 @@ void __init create_kmalloc_caches(slab_flags_t flags)
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
+ struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
unsigned int size = kmalloc_size(i);
- char *n = kasprintf(GFP_NOWAIT,
- "dma-kmalloc-%u", size);
+ const char *n = kmalloc_cache_name("dma-kmalloc", size);
BUG_ON(!n);
- kmalloc_dma_caches[i] = create_kmalloc_cache(n,
- size, SLAB_CACHE_DMA | flags, 0, 0);
+ kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
+ n, size, SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
diff --git a/mm/slub.c b/mm/slub.c
index 8da34a8af53d..e3629cd7aff1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1276,16 +1276,54 @@ out:
__setup("slub_debug", setup_slub_debug);
+/*
+ * kmem_cache_flags - apply debugging options to the cache
+ * @object_size: the size of an object without meta data
+ * @flags: flags to set
+ * @name: name of the cache
+ * @ctor: constructor function
+ *
+ * Debug option(s) are applied to @flags. In addition to the debug
+ * option(s), if a slab name (or multiple) is specified i.e.
+ * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * then only the select slabs will receive the debug option(s).
+ */
slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
- /*
- * Enable debugging if selected on the kernel commandline.
- */
- if (slub_debug && (!slub_debug_slabs || (name &&
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
- flags |= slub_debug;
+ char *iter;
+ size_t len;
+
+ /* If slub_debug = 0, it folds into the if conditional. */
+ if (!slub_debug_slabs)
+ return flags | slub_debug;
+
+ len = strlen(name);
+ iter = slub_debug_slabs;
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchr(iter, ',');
+ if (!end)
+ end = iter + strlen(iter);
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
+
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= slub_debug;
+ break;
+ }
+
+ if (!*end)
+ break;
+ iter = end + 1;
+ }
return flags;
}
@@ -3621,9 +3659,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
- sizeof(long),
- GFP_ATOMIC);
+ unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, text, s->name);
@@ -3638,7 +3674,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
slab_unlock(page);
- kfree(map);
+ bitmap_free(map);
#endif
}
@@ -4411,10 +4447,8 @@ static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map)
return -ENOMEM;
@@ -4422,7 +4456,7 @@ static long validate_slab_cache(struct kmem_cache *s)
flush_all(s);
for_each_kmem_cache_node(s, node, n)
count += validate_slab_node(s, n, map);
- kfree(map);
+ bitmap_free(map);
return count;
}
/*
@@ -4573,14 +4607,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
- kfree(map);
+ bitmap_free(map);
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4646,7 +4678,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
- kfree(map);
+ bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
@@ -4657,6 +4689,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
static void __init resiliency_test(void)
{
u8 *p;
+ int type = KMALLOC_NORMAL;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
@@ -4669,7 +4702,7 @@ static void __init resiliency_test(void)
pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
p + 16);
- validate_slab_cache(kmalloc_caches[4]);
+ validate_slab_cache(kmalloc_caches[type][4]);
/* Hmmm... The next two are dangerous */
p = kzalloc(32, GFP_KERNEL);
@@ -4678,33 +4711,33 @@ static void __init resiliency_test(void)
p);
pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[5]);
+ validate_slab_cache(kmalloc_caches[type][5]);
p = kzalloc(64, GFP_KERNEL);
p += 64 + (get_cycles() & 0xff) * sizeof(void *);
*p = 0x56;
pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
p);
pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[6]);
+ validate_slab_cache(kmalloc_caches[type][6]);
pr_err("\nB. Corruption after free\n");
p = kzalloc(128, GFP_KERNEL);
kfree(p);
*p = 0x78;
pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[7]);
+ validate_slab_cache(kmalloc_caches[type][7]);
p = kzalloc(256, GFP_KERNEL);
kfree(p);
p[50] = 0x9a;
pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[8]);
+ validate_slab_cache(kmalloc_caches[type][8]);
p = kzalloc(512, GFP_KERNEL);
kfree(p);
p[512] = 0xab;
pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[9]);
+ validate_slab_cache(kmalloc_caches[type][9]);
}
#else
#ifdef CONFIG_SYSFS
diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07eea9a6e..67ad061f7fb8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -696,13 +696,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
goto out;
}
-#ifdef CONFIG_DEBUG_VM
/*
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
-#endif
+ page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
section_mark_present(ms);
sparse_init_one_section(ms, section_nr, memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 6861f3140a13..aa483719922e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,7 +29,6 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
-#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 31c45a25b2d3..fd2f21e1c60a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -410,6 +410,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
/* Initiate read into locked page */
+ SetPageWorkingset(new_page);
lru_cache_add_anon(new_page);
*new_page_allocated = true;
return new_page;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d954b71c4f9c..644f746e167a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -103,26 +103,39 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY 0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED 0x2
+/* Reclaim the swap entry if swap is getting full*/
+#define TTRS_FULL 0x4
+
/* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ page = find_get_page(swap_address_space(entry), offset);
if (!page)
return 0;
/*
- * This function is called from scan_swap_map() and it's called
- * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
- * We have to use trylock for avoiding deadlock. This is a special
+ * When this function is called from scan_swap_map_slots() and it's
+ * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * here. We have to use trylock for avoiding deadlock. This is a special
* case and you should use try_to_free_swap() with explicit lock_page()
* in usual operations.
*/
if (trylock_page(page)) {
- ret = try_to_free_swap(page);
+ if ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+ ret = try_to_free_swap(page);
unlock_page(page);
}
put_page(page);
@@ -780,7 +793,7 @@ checks:
int swap_was_freed;
unlock_cluster(ci);
spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
@@ -919,6 +932,7 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
cluster_set_count_flag(ci, 0, 0);
free_cluster(si, idx);
unlock_cluster(ci);
@@ -989,7 +1003,7 @@ start_over:
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FILE))
+ if (!(si->flags & SWP_FS))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -1169,6 +1183,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
ci = lock_cluster_or_swap_info(p, offset);
usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+ if (!usage)
+ free_swap_slot(entry);
return usage;
}
@@ -1199,10 +1215,8 @@ void swap_free(swp_entry_t entry)
struct swap_info_struct *p;
p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, 1))
- free_swap_slot(entry);
- }
+ if (p)
+ __swap_entry_free(p, entry, 1);
}
/*
@@ -1237,9 +1251,6 @@ void put_swap_page(struct page *page, swp_entry_t entry)
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
swap_free_cluster(si, idx);
spin_unlock(&si->lock);
@@ -1612,7 +1623,6 @@ int try_to_free_swap(struct page *page)
int free_swap_and_cache(swp_entry_t entry)
{
struct swap_info_struct *p;
- struct page *page = NULL;
unsigned char count;
if (non_swap_entry(entry))
@@ -1622,30 +1632,9 @@ int free_swap_and_cache(swp_entry_t entry)
if (p) {
count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
- if (page && !trylock_page(page)) {
- put_page(page);
- page = NULL;
- }
- } else if (!count)
- free_swap_slot(entry);
- }
- if (page) {
- /*
- * Not mapped elsewhere, or swap space full? Free it!
- * Also recheck PageSwapCache now page is locked (above).
- */
- if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- }
- unlock_page(page);
- put_page(page);
+ !swap_page_trans_huge_swapped(p, entry))
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
}
return p != NULL;
}
@@ -2310,12 +2299,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
kfree(se);
}
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_ACTIVATED) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
- sis->flags &= ~SWP_FILE;
- mapping->a_ops->swap_deactivate(swap_file);
+ sis->flags &= ~SWP_ACTIVATED;
+ if (mapping->a_ops->swap_deactivate)
+ mapping->a_ops->swap_deactivate(swap_file);
}
}
@@ -2364,6 +2354,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
list_add_tail(&new_se->list, &sis->first_swap_extent.list);
return 1;
}
+EXPORT_SYMBOL_GPL(add_swap_extent);
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
@@ -2411,8 +2402,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+ if (ret >= 0)
+ sis->flags |= SWP_ACTIVATED;
if (!ret) {
- sis->flags |= SWP_FILE;
+ sis->flags |= SWP_FS;
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
}
diff --git a/mm/util.c b/mm/util.c
index 9e3ebd2ef65f..8bf08b5b5760 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -15,17 +15,10 @@
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
-#include <asm/sections.h>
#include <linux/uaccess.h>
#include "internal.h"
-static inline int is_kernel_rodata(unsigned long addr)
-{
- return addr >= (unsigned long)__start_rodata &&
- addr < (unsigned long)__end_rodata;
-}
-
/**
* kfree_const - conditionally free memory
* @x: pointer to the memory
@@ -442,7 +435,7 @@ EXPORT_SYMBOL(kvmalloc_node);
* It is slightly more efficient to use kfree() or vfree() if you are certain
* that you know which one to use.
*
- * Context: Any context except NMI.
+ * Context: Either preemptible task context or not-NMI interrupt.
*/
void kvfree(const void *addr)
{
@@ -685,8 +678,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* Part of the kernel memory, which can be released
* under memory pressure.
*/
- free += global_node_page_state(
- NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
+ free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a728fc492557..97d4b25d0373 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,6 +1577,8 @@ void vfree_atomic(const void *addr)
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
+ * May sleep if called *not* from interrupt context.
+ *
* NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
@@ -1585,6 +1587,8 @@ void vfree(const void *addr)
kmemleak_free(addr);
+ might_sleep_if(!in_interrupt());
+
if (!addr)
return;
if (unlikely(in_interrupt()))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f9cc86e91812..62ac0c488624 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -49,6 +49,7 @@
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
+#include <linux/psi.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -473,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
/*
* Make sure we apply some minimal pressure on default priority
@@ -580,8 +590,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
struct memcg_shrinker_map *map;
- unsigned long freed = 0;
- int ret, i;
+ unsigned long ret, freed = 0;
+ int i;
if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
return 0;
@@ -677,9 +687,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
int priority)
{
+ unsigned long ret, freed = 0;
struct shrinker *shrinker;
- unsigned long freed = 0;
- int ret;
if (!mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
@@ -2146,6 +2155,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
@@ -2457,9 +2467,11 @@ out:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page
+ * because of a round-off error.
*/
- scan = div64_u64(scan * fraction[file],
- denominator);
+ scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
@@ -3303,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long pflags;
int nid;
unsigned int noreclaim_flag;
struct scan_control sc = {
@@ -3331,9 +3344,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.gfp_mask,
sc.reclaim_idx);
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3498,6 +3515,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long pflags;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -3508,6 +3526,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_swap = 1,
};
+ psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN);
@@ -3609,6 +3628,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
out:
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
+ psi_memstall_leave(&pflags);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ba0870ecddd..6038ce593ce3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1143,8 +1143,10 @@ const char * const vmstat_text[] = {
"nr_slab_unreclaimable",
"nr_isolated_anon",
"nr_isolated_file",
+ "workingset_nodes",
"workingset_refault",
"workingset_activate",
+ "workingset_restore",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
@@ -1161,7 +1163,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
- "", /* nr_indirectly_reclaimable */
+ "nr_kernel_misc_reclaimable",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1275,6 +1277,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
+#else
+ "", /* nr_tlb_remote_flush */
+ "", /* nr_tlb_remote_flush_received */
#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
@@ -1283,7 +1288,6 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_DEBUG_VM_VMACACHE
"vmacache_find_calls",
"vmacache_find_hits",
- "vmacache_full_flushes",
#endif
#ifdef CONFIG_SWAP
"swap_ra",
@@ -1661,6 +1665,8 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
stat_items_size += sizeof(struct vm_event_state);
#endif
+ BUILD_BUG_ON(stat_items_size !=
+ ARRAY_SIZE(vmstat_text) * sizeof(unsigned long));
v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
@@ -1704,10 +1710,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
- /* Skip hidden vmstat items. */
- if (*vmstat_text[off] == '\0')
- return 0;
-
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
diff --git a/mm/workingset.c b/mm/workingset.c
index 5cfb29ec3fd9..d46f8c92aa2f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -121,7 +121,7 @@
* the only thing eating into inactive list space is active pages.
*
*
- * Activating refaulting pages
+ * Refaulting inactive pages
*
* All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given
@@ -134,6 +134,10 @@
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
* used once will be evicted from memory.
@@ -141,6 +145,14 @@
* But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache.
*
+ * Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
*
* Implementation
*
@@ -156,8 +168,7 @@
*/
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- NODES_SHIFT + \
- MEM_CGROUP_ID_SHIFT)
+ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
@@ -170,22 +181,27 @@
*/
static unsigned int bucket_order __read_mostly;
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
{
eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+ eviction = (eviction << 1) | workingset;
return xa_mk_value(eviction);
}
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, bool *workingsetp)
{
unsigned long entry = xa_to_value(shadow);
int memcgid, nid;
+ bool workingset;
+ workingset = entry & 1;
+ entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -194,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+ *workingsetp = workingset;
}
/**
@@ -206,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
@@ -219,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, pgdat, eviction);
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
/**
* workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
*/
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
{
unsigned long refault_distance;
+ struct pglist_data *pgdat;
unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
- struct pglist_data *pgdat;
+ bool workingset;
int memcgid;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
@@ -262,41 +279,51 @@ bool workingset_refault(void *shadow)
* configurations instead.
*/
memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg) {
- rcu_read_unlock();
- return false;
- }
+ if (!mem_cgroup_disabled() && !memcg)
+ goto out;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
/*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
+ * Calculate the refault distance
*
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * inactive_age to lap a shadow entry in the field, which can
+ * then result in a false small refault distance, leading to a
+ * false activation should this old entry actually refault
+ * again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
- if (refault_distance <= active_file) {
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
- rcu_read_unlock();
- return true;
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */
+ if (refault_distance > active_file)
+ goto out;
+
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
}
+out:
rcu_read_unlock();
- return false;
}
/**
@@ -349,12 +376,20 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
+ VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+
if (node->count && node->count == node->nr_values) {
- if (list_empty(&node->private_list))
+ if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
+ __inc_lruvec_page_state(virt_to_page(node),
+ WORKINGSET_NODES);
+ }
} else {
- if (!list_empty(&node->private_list))
+ if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
+ __dec_lruvec_page_state(virt_to_page(node),
+ WORKINGSET_NODES);
+ }
}
}
@@ -363,7 +398,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
{
unsigned long max_nodes;
unsigned long nodes;
- unsigned long cache;
+ unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
@@ -389,14 +424,20 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
*
* PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
*/
+#ifdef CONFIG_MEMCG
if (sc->memcg) {
- cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL_FILE);
- } else {
- cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
- node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
- }
- max_nodes = cache >> (XA_CHUNK_SHIFT - 3);
+ struct lruvec *lruvec;
+
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL);
+ lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
+ pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+ } else
+#endif
+ pages = node_present_pages(sc->nid);
+
+ max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
if (!nodes)
return SHRINK_EMPTY;
@@ -438,6 +479,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
list_lru_isolate(lru, item);
+ __dec_lruvec_page_state(virt_to_page(node), WORKINGSET_NODES);
+
spin_unlock(lru_lock);
/*
@@ -459,7 +502,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* shadow entries we were tracking ...
*/
xas_store(&xas, NULL);
- inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
+ __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
@@ -481,7 +524,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
- .seeks = DEFAULT_SEEKS,
+ .seeks = 0, /* ->count reports only fully expendable nodes */
.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9da65552e7ca..0787d33b80d8 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -418,7 +418,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle,
case ZPOOL_MM_WO:
zs_mm = ZS_MM_WO;
break;
- case ZPOOL_MM_RW: /* fallthru */
+ case ZPOOL_MM_RW: /* fall through */
default:
zs_mm = ZS_MM_RW;
break;