summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 2
-rw-r--r-- mm/cleancache.c 2
-rw-r--r-- mm/filemap.c 76
-rw-r--r-- mm/gup.c 35
-rw-r--r-- mm/huge_memory.c 8
-rw-r--r-- mm/hugetlb.c 5
-rw-r--r-- mm/khugepaged.c 1
-rw-r--r-- mm/ksm.c 3
-rw-r--r-- mm/memblock.c 23
-rw-r--r-- mm/memcontrol.c 10
-rw-r--r-- mm/memory-failure.c 13
-rw-r--r-- mm/memory.c 78
-rw-r--r-- mm/mempool.c 2
-rw-r--r-- mm/mlock.c 5
-rw-r--r-- mm/mmap.c 160
-rw-r--r-- mm/page_alloc.c 37
-rw-r--r-- mm/page_io.c 4
-rw-r--r-- mm/rmap.c 16
-rw-r--r-- mm/shmem.c 8
-rw-r--r-- mm/slub.c 46
-rw-r--r-- mm/swap_cgroup.c 3
-rw-r--r-- mm/util.c 7
-rw-r--r-- mm/vmalloc.c 15
-rw-r--r-- mm/vmpressure.c 6
-rw-r--r-- mm/vmscan.c 2
25 files changed, 345 insertions, 222 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..398b46064544 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
bool
config ARCH_DISCARD_MEMBLOCK
diff --git a/mm/cleancache.c b/mm/cleancache.c
index ba5d8f3e6d68..f7b9fdc79d97 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb)
int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
if (cleancache_ops) {
- pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);
if (pool_id < 0)
pool_id = CLEANCACHE_NO_POOL;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f1be573a5e6..aea58e983a73 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -376,6 +376,38 @@ int filemap_flush(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_flush);
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ *
+ * Both byte offsets are converted to page indices by shifting right by
+ * PAGE_SHIFT. A single page is requested from pagevec_lookup(), which is
+ * handed the index of @start_byte (presumably as its starting point -
+ * confirm against pagevec_lookup()). The range counts as populated when
+ * the index of the page found does not exceed the index of @end_byte.
+ *
+ * Return: false when @end_byte is below @start_byte, when @mapping has no
+ * pages at all, or when the lookup returns nothing; otherwise true if and
+ * only if the index of the page found is less than or equal to the end
+ * index.
+ */
+bool filemap_range_has_page(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ pgoff_t index = start_byte >> PAGE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_SHIFT;
+ struct pagevec pvec;
+ bool ret;
+
+ if (end_byte < start_byte)
+ return false;
+
+ if (mapping->nrpages == 0)
+ return false;
+
+ pagevec_init(&pvec, 0);
+ if (!pagevec_lookup(&pvec, mapping, index, 1))
+ return false;
+ ret = (pvec.pages[0]->index <= end); /* evaluated before the pagevec is released */
+ pagevec_release(&pvec);
+ return ret;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
static int __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
@@ -768,10 +800,10 @@ struct wait_page_key {
struct wait_page_queue {
struct page *page;
int bit_nr;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
@@ -834,7 +866,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, bool lock)
{
struct wait_page_queue wait_page;
- wait_queue_t *wait = &wait_page.wait;
+ wait_queue_entry_t *wait = &wait_page.wait;
int ret = 0;
init_wait(wait);
@@ -845,9 +877,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
for (;;) {
spin_lock_irq(&q->lock);
- if (likely(list_empty(&wait->task_list))) {
+ if (likely(list_empty(&wait->entry))) {
if (lock)
- __add_wait_queue_tail_exclusive(q, wait);
+ __add_wait_queue_entry_tail_exclusive(q, wait);
else
__add_wait_queue(q, wait);
SetPageWaiters(page);
@@ -907,7 +939,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
-void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
wait_queue_head_t *q = page_waitqueue(page);
unsigned long flags;
@@ -2038,10 +2070,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t size;
size = i_size_read(inode);
- retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_has_page(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
+ return -EAGAIN;
+ } else {
+ retval = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (retval < 0)
+ goto out;
+ }
file_accessed(file);
@@ -2642,6 +2681,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
pos = iocb->ki_pos;
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
if (limit != RLIM_INFINITY) {
if (iocb->ki_pos >= limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2710,9 +2752,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
- written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
- if (written)
- goto out;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* If there are pages to writeback, return */
+ if (filemap_range_has_page(inode->i_mapping, pos,
+ pos + iov_iter_count(from)))
+ return -EAGAIN;
+ } else {
+ written = filemap_write_and_wait_range(mapping, pos,
+ pos + write_len - 1);
+ if (written)
+ goto out;
+ }
/*
* After a write we want buffered reads to be sure to go to disk to get
diff --git a/mm/gup.c b/mm/gup.c
index d9e6fddcc51f..3ab78dc3db7d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -387,11 +387,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
- /* For mm_populate(), just skip the stack guard page. */
- if ((*flags & FOLL_POPULATE) &&
- (stack_guard_page_start(vma, address) ||
- stack_guard_page_end(vma, address + PAGE_SIZE)))
- return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
@@ -407,12 +402,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
ret = handle_mm_fault(vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
- if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
- return -EFAULT;
+ int err = vm_fault_to_errno(ret, *flags);
+
+ if (err)
+ return err;
BUG();
}
@@ -723,12 +716,10 @@ retry:
ret = handle_mm_fault(vma, address, fault_flags);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return -EHWPOISON;
- if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
- return -EFAULT;
+ int err = vm_fault_to_errno(ret, 0);
+
+ if (err)
+ return err;
BUG();
}
@@ -1155,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
/*
- * Generic RCU Fast GUP
+ * Generic Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -1176,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr)
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
- * pages containing page tables.
+ * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
*
@@ -1187,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr)
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifdef CONFIG_HAVE_GENERIC_GUP
#ifndef gup_get_pte
/*
@@ -1677,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
return ret;
}
-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+#endif /* CONFIG_HAVE_GENERIC_GUP */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a84909cf20d3..88c6167f194d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1426,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
page = pmd_page(*vmf->pmd);
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
spin_unlock(vmf->ptl);
wait_on_page_locked(page);
+ put_page(page);
goto out;
}
@@ -1459,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
+ page_nid = -1;
+ if (!get_page_unless_zero(page))
+ goto out_unlock;
spin_unlock(vmf->ptl);
wait_on_page_locked(page);
- page_nid = -1;
+ put_page(page);
goto out;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e5828875f7bb..3eedb187e549 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4170,6 +4170,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
+ int err = vm_fault_to_errno(ret, flags);
+
+ if (err)
+ return err;
+
remainder = 0;
break;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 945fd1ca49b5..df4ebdb2b10a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
- cond_resched();
}
}
diff --git a/mm/ksm.c b/mm/ksm.c
index d9fc0e456128..216184af0e19 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1028,8 +1028,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
goto out;
if (PageTransCompound(page)) {
- err = split_huge_page(page);
- if (err)
+ if (split_huge_page(page))
goto out_unlock;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index b049c9b2dba8..7b8a5db76a2f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1739,6 +1739,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
}
}
+extern unsigned long __init_memblock
+memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
+{ /*
+ * Walk memblock.reserved and add up the size of every region that is
+ * not skipped by the two range checks in the loop; return that total.
+ * A region that passes the checks contributes its whole rgn->size,
+ * not only the part lying between start_addr and end_addr.
+ */
+ struct memblock_region *rgn;
+ unsigned long size = 0;
+ int idx; /* NOTE(review): not named in this body; presumably used by for_each_memblock_type() - confirm */
+
+ for_each_memblock_type((&memblock.reserved), rgn) {
+ phys_addr_t start, end;
+
+ if (rgn->base + rgn->size < start_addr) /* strict '<': a region ending exactly at start_addr is still counted */
+ continue;
+ if (rgn->base > end_addr) /* region begins after the requested range */
+ continue;
+
+ start = rgn->base;
+ end = start + rgn->size;
+ size += end - start; /* equals rgn->size: no clipping to the requested range */
+ }
+
+ return size;
+}
+
void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94172089f52f..d75b38b66ef6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -170,7 +170,7 @@ struct mem_cgroup_event {
*/
poll_table pt;
wait_queue_head_t *wqh;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
struct work_struct remove;
};
@@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
struct mem_cgroup *memcg;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
};
-static int memcg_oom_wake_function(wait_queue_t *wait,
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
unsigned mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
@@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
- INIT_LIST_HEAD(&owait.wait.task_list);
+ INIT_LIST_HEAD(&owait.wait.entry);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
mem_cgroup_mark_under_oom(memcg);
@@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2527dfeddb00..ecc183fd94f3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- page_flags = p->flags;
+ if (PageHuge(p))
+ page_flags = hpage->flags;
+ else
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1595,12 +1598,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
pfn, ret, page->flags, &page->flags);
- /*
- * We know that soft_offline_huge_page() tries to migrate
- * only one hugepage pointed to by hpage, so we need not
- * run through the pagelist here.
- */
- putback_active_hugepage(hpage);
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
if (ret > 0)
ret = -EIO;
} else {
diff --git a/mm/memory.c b/mm/memory.c
index 6ff5d729ded0..bb11c474857e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2855,40 +2855,6 @@ out_release:
}
/*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
- address &= PAGE_MASK;
- if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
- struct vm_area_struct *prev = vma->vm_prev;
-
- /*
- * Is there a mapping abutting this one below?
- *
- * That's only ok if it's the same stack mapping
- * that has gotten split..
- */
- if (prev && prev->vm_end == address)
- return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
- return expand_downwards(vma, address - PAGE_SIZE);
- }
- if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
- struct vm_area_struct *next = vma->vm_next;
-
- /* As VM_GROWSDOWN but s/below/above/ */
- if (next && next->vm_start == address + PAGE_SIZE)
- return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
- return expand_upwards(vma, address + PAGE_SIZE);
- }
- return 0;
-}
-
-/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
- /* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, vmf->address) < 0)
- return VM_FAULT_SIGSEGV;
-
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
@@ -3029,6 +2991,17 @@ static int __do_fault(struct vm_fault *vmf)
return ret;
}
+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ *
+ * Returns nonzero when either check reports true. Because || short-circuits,
+ * pmd_trans_unstable() is only called when pmd_devmap(*pmd) is false, which
+ * is what keeps the ordering described above.
+ */
+static int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+ return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
static int pte_alloc_one_map(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3052,18 +3025,27 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
map_pte:
/*
* If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
- * didn't become pmd_trans_huge under us and then back to pmd_none, as
- * a result of MADV_DONTNEED running immediately after a huge pmd fault
- * in a different thread of this mm, in turn leading to a misleading
- * pmd_trans_huge() retval. All we have to ensure is that it is a
- * regular pmd that we can walk with pte_offset_map() and we can do that
- * through an atomic read in C, which is what pmd_trans_unstable()
- * provides.
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
+ * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
+ * under us and then back to pmd_none, as a result of MADV_DONTNEED
+ * running immediately after a huge pmd fault in a different thread of
+ * this mm, in turn leading to a misleading pmd_trans_huge() retval.
+ * All we have to ensure is that it is a regular pmd that we can walk
+ * with pte_offset_map() and we can do that through an atomic read in
+ * C, which is what pmd_trans_unstable() provides.
*/
- if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
+ if (pmd_devmap_trans_unstable(vmf->pmd))
return VM_FAULT_NOPAGE;
+ /*
+ * At this point we know that our vmf->pmd points to a page of ptes
+ * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
+ * for the duration of the fault. If a racing MADV_DONTNEED runs and
+ * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
+ * be valid and we will re-check to make sure the vmf->pte isn't
+ * pte_none() under vmf->ptl protection when we return to
+ * alloc_set_pte().
+ */
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
return 0;
@@ -3690,7 +3672,7 @@ static int handle_pte_fault(struct vm_fault *vmf)
vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
+ if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
diff --git a/mm/mempool.c b/mm/mempool.c
index 47a659dedd44..1c0294858527 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
- wait_queue_t wait;
+ wait_queue_entry_t wait;
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
diff --git a/mm/mlock.c b/mm/mlock.c
index c483c5c20b4b..b562b5523a65 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
int i;
int nr = pagevec_count(pvec);
- int delta_munlocked;
+ int delta_munlocked = -nr;
struct pagevec pvec_putback;
int pgrescued = 0;
@@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
continue;
else
__munlock_isolation_failed(page);
+ } else {
+ delta_munlocked++;
}
/*
@@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(zone_lru_lock(zone));
diff --git a/mm/mmap.c b/mm/mmap.c
index f82741e199c0..a5e3dcd75e79 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
LIST_HEAD(uf);
@@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
}
/* Check against existing mmap mappings. */
- if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+ next = find_vma(mm, oldbrk);
+ if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
/* Ok, looks good - let it rip. */
@@ -253,10 +255,22 @@ out:
static long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
- unsigned long max, subtree_gap;
- max = vma->vm_start;
- if (vma->vm_prev)
- max -= vma->vm_prev->vm_end;
+ unsigned long max, prev_end, subtree_gap;
+
+ /*
+ * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
+ * allow two stack_guard_gaps between them here, and when choosing
+ * an unmapped area; whereas when expanding we only require one.
+ * That's a little inconsistent, but keeps the code here simpler.
+ */
+ max = vm_start_gap(vma);
+ if (vma->vm_prev) {
+ prev_end = vm_end_gap(vma->vm_prev);
+ if (max > prev_end)
+ max -= prev_end;
+ else
+ max = 0;
+ }
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_unlock_read(anon_vma);
}
- highest_address = vma->vm_end;
+ highest_address = vm_end_gap(vma);
vma = vma->vm_next;
i++;
}
@@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = vma->vm_end;
+ mm->highest_vm_end = vm_end_gap(vma);
/*
* vma->vm_prev wasn't known when we followed the rbtree to find the
@@ -856,7 +870,7 @@ again:
vma_gap_update(vma);
if (end_changed) {
if (!next)
- mm->highest_vm_end = end;
+ mm->highest_vm_end = vm_end_gap(vma);
else if (!adjust_next)
vma_gap_update(next);
}
@@ -941,7 +955,7 @@ again:
* mm->highest_vm_end doesn't need any update
* in remove_next == 1 case.
*/
- VM_WARN_ON(mm->highest_vm_end != end);
+ VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
}
}
if (insert && file)
@@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
while (true) {
/* Visit left subtree if it looks promising */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
@@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
}
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
- if (gap_end >= low_limit && gap_end - gap_start >= length)
+ if (gap_end >= low_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
@@ -1825,8 +1840,8 @@ check_current:
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
- gap_start = vma->vm_prev->vm_end;
- gap_end = vma->vm_start;
+ gap_start = vm_end_gap(vma->vm_prev);
+ gap_end = vm_start_gap(vma);
goto check_current;
}
}
@@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
while (true) {
/* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
@@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
- gap_end = vma->vm_start;
+ gap_end = vm_start_gap(vma);
if (gap_end < low_limit)
return -ENOMEM;
- if (gap_start <= high_limit && gap_end - gap_start >= length)
+ if (gap_start <= high_limit &&
+ gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
@@ -1929,7 +1945,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_right) {
gap_start = vma->vm_prev ?
- vma->vm_prev->vm_end : 0;
+ vm_end_gap(vma->vm_prev) : 0;
goto check_current;
}
}
@@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE - mmap_min_addr)
@@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
const unsigned long flags)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
@@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
- vma = find_vma(mm, addr);
+ vma = find_vma_prev(mm, addr, &prev);
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start, actual_size;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
- actual_size = size;
- if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
- actual_size -= PAGE_SIZE;
- if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /* Guard against wrapping around to address 0. */
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= TASK_SIZE)
return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = vma->vm_next;
+ if (next && next->vm_start < gap_addr) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
@@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
- mm->highest_vm_end = address;
+ mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ unsigned long gap_addr;
int error;
address &= PAGE_MASK;
@@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
+ /* Enforce stack_guard_gap */
+ gap_addr = address - stack_guard_gap;
+ if (gap_addr > address)
+ return -ENOMEM;
+ prev = vma->vm_prev;
+ if (prev && prev->vm_end > gap_addr) {
+ if (!(prev->vm_flags & VM_GROWSDOWN))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
@@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
}
-/*
- * Note how expand_stack() refuses to expand the stack all the way to
- * abut the next virtual mapping, *unless* that mapping itself is also
- * a stack mapping. We want to leave room for a guard page, after all
- * (the guard page itself is not added here, that is done by the
- * actual page faulting logic)
- *
- * This matches the behavior of the guard page logic (see mm/memory.c:
- * check_stack_guard_page()), which only allows the guard page to be
- * removed under these circumstances.
- */
+/* enforced gap between the expanding stack and other mappings. */
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+/*
+ * Parse the "stack_guard_gap=" boot parameter.
+ *
+ * @p is the text following the '='. It is read as a decimal number of
+ * pages and is applied only when the whole string parses (no trailing
+ * characters); otherwise stack_guard_gap keeps its previous value.
+ *
+ * Returns 1 so the option is reported as handled. A __setup() handler
+ * that returns 0 tells the boot code the option was not consumed, so a
+ * correctly parsed "stack_guard_gap=N" would still be treated as an
+ * unknown parameter and passed on to init.
+ */
+static int __init cmdline_parse_stack_guard_gap(char *p)
+{
+ unsigned long val;
+ char *endptr;
+
+ val = simple_strtoul(p, &endptr, 10);
+ if (!*endptr)
+ stack_guard_gap = val << PAGE_SHIFT;
+
+ return 1;
+}
+__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *next;
-
- address &= PAGE_MASK;
- next = vma->vm_next;
- if (next && next->vm_start == address + PAGE_SIZE) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- }
return expand_upwards(vma, address);
}
@@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *prev;
-
- address &= PAGE_MASK;
- prev = vma->vm_prev;
- if (prev && prev->vm_end == address) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
- return -ENOMEM;
- }
return expand_downwards(vma, address);
}
@@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = prev;
vma_gap_update(vma);
} else
- mm->highest_vm_end = prev ? prev->vm_end : 0;
+ mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;
/* Kill the cache */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9e450c6b6e4..2302f250d6b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -292,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
+ unsigned long max_initialise;
+ unsigned long reserved_lowmem;
+
+ /*
+ * Compute pgdat->static_init_size, then reset first_deferred_pfn.
+ *
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
+
+ /*
+ * Compensate for all the memblock reservations (e.g. crash kernel)
+ * in the initial estimation to make sure we will initialize enough
+ * memory to boot.
+ *
+ * NOTE(review): the range passed below is node_start_pfn plus a page
+ * count, while memblock_reserved_memory_within() is declared with
+ * phys_addr_t parameters - confirm the units agree, and that the
+ * value it returns is in pages before it is added to max_initialise.
+ */
+ reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
+ pgdat->node_start_pfn + max_initialise);
+ max_initialise += reserved_lowmem;
+
+ pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
@@ -314,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
- unsigned long max_initialise;
-
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
- /*
- * Initialise at least 2G of a node but also take into account that
- * two large system hashes that can take up 1GB for 0.25TB/node.
- */
- max_initialise = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
-
(*nr_initialised)++;
- if ((*nr_initialised > max_initialise) &&
+ if ((*nr_initialised > pgdat->static_init_size) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -3870,7 +3881,9 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE))
+ if (test_thread_flag(TIF_MEMDIE) &&
+ (alloc_flags == ALLOC_NO_WATERMARKS ||
+ (gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
@@ -6136,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
- reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
pgdat->per_cpu_nodestats = NULL;
@@ -6158,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
+ reset_deferred_meminit(pgdat);
free_area_init_core(pgdat);
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..2da71e627812 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
/*
* We failed to write the page out to swap-space.
@@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_error) {
+ if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..130c238fe384 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- int cpu;
if (!tlb_ubc->flush_required)
return;
- cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- local_flush_tlb();
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- }
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
- cpumask_clear(&tlb_ubc->cpumask);
+ arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
- put_cpu();
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index e67d6ba4e98e..9100c4952698 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt;
#include <uapi/linux/memfd.h>
#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
+#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
@@ -1902,10 +1903,10 @@ unlock:
* entry unconditionally - even if something else had already woken the
* target.
*/
-static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
- list_del_init(&wait->task_list);
+ list_del_init(&wait->entry);
return ret;
}
@@ -2840,7 +2841,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
- WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+ WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
spin_unlock(&inode->i_lock);
error = 0;
goto out;
@@ -3761,6 +3762,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
#endif
+ uuid_gen(&sb->s_uuid);
inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)
diff --git a/mm/slub.c b/mm/slub.c
index 57e5156f02be..8addc535bcdc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5512,6 +5512,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
char mbuf[64];
char *buf;
struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+ ssize_t len;
if (!attr || !attr->store || !attr->show)
continue;
@@ -5536,8 +5537,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
buf = buffer;
}
- attr->show(root_cache, buf);
- attr->store(s, buf, strlen(buf));
+ len = attr->show(root_cache, buf);
+ if (len > 0)
+ attr->store(s, buf, len);
}
if (buffer)
@@ -5623,6 +5625,28 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
+static void sysfs_slab_remove_workfn(struct work_struct *work)
+{
+ struct kmem_cache *s =
+ container_of(work, struct kmem_cache, kobj_remove_work);
+
+ if (!s->kobj.state_in_sysfs)
+ /*
+ * For a memcg cache, this may be called during
+ * deactivation and again on shutdown. Remove only once.
+ * A cache is never shut down before deactivation is
+ * complete, so no need to worry about synchronization.
+ */
+ return;
+
+#ifdef CONFIG_MEMCG
+ kset_unregister(s->memcg_kset);
+#endif
+ kobject_uevent(&s->kobj, KOBJ_REMOVE);
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
+}
+
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5630,6 +5654,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
+
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5693,20 +5719,8 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- return;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
- kobject_del(&s->kobj);
+ kobject_get(&s->kobj);
+ schedule_work(&s->kobj_remove_work);
}
void sysfs_slab_release(struct kmem_cache *s)
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index ac6318a064d3..3405b4ee1757 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type)
if (!page)
goto not_enough_page;
ctrl->map[idx] = page;
+
+ if (!(idx % SWAP_CLUSTER_MAX))
+ cond_resched();
}
return 0;
not_enough_page:
diff --git a/mm/util.c b/mm/util.c
index 464df3489903..26be6407abd7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -357,8 +357,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
/*
- * Make sure that larger requests are not too disruptive - no OOM
- * killer and no allocation failure warnings as we have a fallback
+ * We want to attempt a large physically contiguous block first because
+ * it is less likely to fragment multiple larger blocks and therefore
+ * contributes less to long-term fragmentation than vmalloc fallback.
+ * However make sure that larger requests are not too disruptive - no
+ * OOM killer and no allocation failure warnings as we have a fallback.
*/
if (size > PAGE_SIZE) {
kmalloc_flags |= __GFP_NOWARN;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 34a1c3e46ed7..ecc97f74ab18 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (p4d_none(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+
+ /*
+ * Don't dereference bad PUD or PMD (below) entries. This will also
+ * identify huge mappings, which we may encounter on architectures
+ * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
+ * identified as vmalloc addresses by is_vmalloc_addr(), but are
+ * not [unambiguously] associated with a struct page, so there is
+ * no correct value to return for them.
+ */
+ WARN_ON_ONCE(pud_bad(*pud));
+ if (pud_none(*pud) || pud_bad(*pud))
return NULL;
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ WARN_ON_ONCE(pmd_bad(*pmd));
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
return NULL;
ptep = pte_offset_map(pmd, addr);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 6063581f705c..ce0618bfa8d0 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
unsigned long pressure = 0;
/*
- * reclaimed can be greater than scanned in cases
- * like THP, where the scanned is 1 and reclaimed
- * could be 512
+ * reclaimed can be greater than scanned for things such as reclaimed
+ * slab pages. shrink_node() just adds reclaimed pages without a
+ * related increment to scanned pages.
*/
if (reclaimed >= scanned)
goto out;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ad39bbc79e6..c3c1c6ac62da 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3652,7 +3652,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;