diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2 | ||||
-rw-r--r-- | mm/cleancache.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 76 | ||||
-rw-r--r-- | mm/gup.c | 35 | ||||
-rw-r--r-- | mm/huge_memory.c | 8 | ||||
-rw-r--r-- | mm/hugetlb.c | 5 | ||||
-rw-r--r-- | mm/khugepaged.c | 1 | ||||
-rw-r--r-- | mm/ksm.c | 3 | ||||
-rw-r--r-- | mm/memblock.c | 23 | ||||
-rw-r--r-- | mm/memcontrol.c | 10 | ||||
-rw-r--r-- | mm/memory-failure.c | 13 | ||||
-rw-r--r-- | mm/memory.c | 78 | ||||
-rw-r--r-- | mm/mempool.c | 2 | ||||
-rw-r--r-- | mm/mlock.c | 5 | ||||
-rw-r--r-- | mm/mmap.c | 160 | ||||
-rw-r--r-- | mm/page_alloc.c | 37 | ||||
-rw-r--r-- | mm/page_io.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 16 | ||||
-rw-r--r-- | mm/shmem.c | 8 | ||||
-rw-r--r-- | mm/slub.c | 46 | ||||
-rw-r--r-- | mm/swap_cgroup.c | 3 | ||||
-rw-r--r-- | mm/util.c | 7 | ||||
-rw-r--r-- | mm/vmalloc.c | 15 | ||||
-rw-r--r-- | mm/vmpressure.c | 6 | ||||
-rw-r--r-- | mm/vmscan.c | 2 |
25 files changed, 345 insertions, 222 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index beb7a455915d..398b46064544 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP bool config ARCH_DISCARD_MEMBLOCK diff --git a/mm/cleancache.c b/mm/cleancache.c index ba5d8f3e6d68..f7b9fdc79d97 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -130,7 +130,7 @@ void __cleancache_init_shared_fs(struct super_block *sb) int pool_id = CLEANCACHE_NO_BACKEND_SHARED; if (cleancache_ops) { - pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE); if (pool_id < 0) pool_id = CLEANCACHE_NO_POOL; } diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..aea58e983a73 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -376,6 +376,38 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/** + * filemap_range_has_page - check if a page exists in range. + * @mapping: address space within which to check + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. + */ +bool filemap_range_has_page(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; + struct pagevec pvec; + bool ret; + + if (end_byte < start_byte) + return false; + + if (mapping->nrpages == 0) + return false; + + pagevec_init(&pvec, 0); + if (!pagevec_lookup(&pvec, mapping, index, 1)) + return false; + ret = (pvec.pages[0]->index <= end); + pagevec_release(&pvec); + return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); + static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { @@ -768,10 +800,10 @@ struct wait_page_key { struct wait_page_queue { struct page *page; int bit_nr; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page @@ -834,7 +866,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) { struct wait_page_queue wait_page; - wait_queue_t *wait = &wait_page.wait; + wait_queue_entry_t *wait = &wait_page.wait; int ret = 0; init_wait(wait); @@ -845,9 +877,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, for (;;) { spin_lock_irq(&q->lock); - if (likely(list_empty(&wait->task_list))) { + if (likely(list_empty(&wait->entry))) { if (lock) - __add_wait_queue_tail_exclusive(q, wait); + __add_wait_queue_entry_tail_exclusive(q, wait); else __add_wait_queue(q, wait); SetPageWaiters(page); @@ -907,7 +939,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; @@ -2038,10 +2070,17 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1)) + return -EAGAIN; + } else { + retval = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (retval < 0) + goto out; + } file_accessed(file); @@ -2642,6 +2681,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + if (limit != RLIM_INFINITY) { if (iocb->ki_pos >= limit) { send_sig(SIGXFSZ, current, 0); @@ -2710,9 +2752,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(inode->i_mapping, pos, + pos + iov_iter_count(from))) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } /* * After a write we want buffered reads to be sure to go to disk to get @@ -387,11 +387,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; - /* For mm_populate(), just skip the stack guard page. */ - if ((*flags & FOLL_POPULATE) && - (stack_guard_page_start(vma, address) || - stack_guard_page_end(vma, address + PAGE_SIZE))) - return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) @@ -407,12 +402,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, ret = handle_mm_fault(vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, *flags); + + if (err) + return err; BUG(); } @@ -723,12 +716,10 @@ retry: ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, 0); + + if (err) + return err; BUG(); } @@ -1155,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic RCU Fast GUP + * Generic Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1176,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - * pages containing page tables. + * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * @@ -1187,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP #ifndef gup_get_pte /* @@ -1677,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a84909cf20d3..88c6167f194d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1426,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ if (unlikely(pmd_trans_migrating(*vmf->pmd))) { page = pmd_page(*vmf->pmd); + if (!get_page_unless_zero(page)) + goto out_unlock; spin_unlock(vmf->ptl); wait_on_page_locked(page); + put_page(page); goto out; } @@ -1459,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { + page_nid = -1; + if (!get_page_unless_zero(page)) + goto out_unlock; spin_unlock(vmf->ptl); wait_on_page_locked(page); - page_nid = -1; + put_page(page); goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5828875f7bb..3eedb187e549 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4170,6 +4170,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } ret = hugetlb_fault(mm, vma, vaddr, fault_flags); if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, flags); + + if (err) + return err; + remainder = 0; break; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 945fd1ca49b5..df4ebdb2b10a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spin_unlock(ptl); free_page_and_swap_cache(src_page); } - cond_resched(); } } @@ -1028,8 +1028,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, goto out; if (PageTransCompound(page)) { - err = split_huge_page(page); - if (err) + if (split_huge_page(page)) goto out_unlock; } diff --git a/mm/memblock.c b/mm/memblock.c index b049c9b2dba8..7b8a5db76a2f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1739,6 +1739,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } +extern unsigned long __init_memblock +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct memblock_region *rgn; + unsigned long size = 0; + int idx; + + for_each_memblock_type((&memblock.reserved), rgn) { + phys_addr_t start, end; + + if (rgn->base + rgn->size < start_addr) + continue; + if (rgn->base > end_addr) + continue; + + start = rgn->base; + end = start + rgn->size; + size += end - start; + } + + return size; +} + void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..d75b38b66ef6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event { */ poll_table pt; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct remove; }; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { struct mem_cgroup *memcg; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + INIT_LIST_HEAD(&owait.wait.entry); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); mem_cgroup_mark_under_oom(memcg); @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct mem_cgroup_event *event = diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2527dfeddb00..ecc183fd94f3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + if (PageHuge(p)) + page_flags = hpage->flags; + else + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock @@ -1595,12 +1598,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", pfn, ret, page->flags, &page->flags); - /* - * We know that soft_offline_huge_page() tries to migrate - * only one hugepage pointed to by hpage, so we need not - * run through the pagelist here. - */ - putback_active_hugepage(hpage); + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); if (ret > 0) ret = -EIO; } else { diff --git a/mm/memory.c b/mm/memory.c index 6ff5d729ded0..bb11c474857e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2855,40 +2855,6 @@ out_release: } /* - * This is like a special single-page "expand_{down|up}wards()", - * except we must first make sure that 'address{-|+}PAGE_SIZE' - * doesn't hit another vma. - */ -static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) -{ - address &= PAGE_MASK; - if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { - struct vm_area_struct *prev = vma->vm_prev; - - /* - * Is there a mapping abutting this one below? - * - * That's only ok if it's the same stack mapping - * that has gotten split.. - */ - if (prev && prev->vm_end == address) - return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - - return expand_downwards(vma, address - PAGE_SIZE); - } - if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { - struct vm_area_struct *next = vma->vm_next; - - /* As VM_GROWSDOWN but s/below/above/ */ - if (next && next->vm_start == address + PAGE_SIZE) - return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - - return expand_upwards(vma, address + PAGE_SIZE); - } - return 0; -} - -/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf) if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; - /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, vmf->address) < 0) - return VM_FAULT_SIGSEGV; - /* * Use pte_alloc() instead of pte_alloc_map(). We can't run * pte_offset_map() on pmds where a huge pmd might be created @@ -3029,6 +2991,17 @@ static int __do_fault(struct vm_fault *vmf) return ret; } +/* + * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. + * If we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. + */ +static int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + static int pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3052,18 +3025,27 @@ static int pte_alloc_one_map(struct vm_fault *vmf) map_pte: /* * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd - * didn't become pmd_trans_huge under us and then back to pmd_none, as - * a result of MADV_DONTNEED running immediately after a huge pmd fault - * in a different thread of this mm, in turn leading to a misleading - * pmd_trans_huge() retval. All we have to ensure is that it is a - * regular pmd that we can walk with pte_offset_map() and we can do that - * through an atomic read in C, which is what pmd_trans_unstable() - * provides. + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of + * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge + * under us and then back to pmd_none, as a result of MADV_DONTNEED + * running immediately after a huge pmd fault in a different thread of + * this mm, in turn leading to a misleading pmd_trans_huge() retval. + * All we have to ensure is that it is a regular pmd that we can walk + * with pte_offset_map() and we can do that through an atomic read in + * C, which is what pmd_trans_unstable() provides. */ - if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) + if (pmd_devmap_trans_unstable(vmf->pmd)) return VM_FAULT_NOPAGE; + /* + * At this point we know that our vmf->pmd points to a page of ptes + * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() + * for the duration of the fault. If a racing MADV_DONTNEED runs and + * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still + * be valid and we will re-check to make sure the vmf->pte isn't + * pte_none() under vmf->ptl protection when we return to + * alloc_set_pte(). + */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); return 0; @@ -3690,7 +3672,7 @@ static int handle_pte_fault(struct vm_fault *vmf) vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) + if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge diff --git a/mm/mempool.c b/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/mlock.c b/mm/mlock.c index c483c5c20b4b..b562b5523a65 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); - int delta_munlocked; + int delta_munlocked = -nr; struct pagevec pvec_putback; int pgrescued = 0; @@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) continue; else __munlock_isolation_failed(page); + } else { + delta_munlocked++; } /* @@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(zone_lru_lock(zone)); diff --git a/mm/mmap.c b/mm/mmap.c index f82741e199c0..a5e3dcd75e79 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; + struct vm_area_struct *next; unsigned long min_brk; bool populate; LIST_HEAD(uf); @@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) } /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + next = find_vma(mm, oldbrk); + if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; /* Ok, looks good - let it rip. */ @@ -253,10 +255,22 @@ out: static long vma_compute_subtree_gap(struct vm_area_struct *vma) { - unsigned long max, subtree_gap; - max = vma->vm_start; - if (vma->vm_prev) - max -= vma->vm_prev->vm_end; + unsigned long max, prev_end, subtree_gap; + + /* + * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we + * allow two stack_guard_gaps between them here, and when choosing + * an unmapped area; whereas when expanding we only require one. + * That's a little inconsistent, but keeps the code here simpler. + */ + max = vm_start_gap(vma); + if (vma->vm_prev) { + prev_end = vm_end_gap(vma->vm_prev); + if (max > prev_end) + max -= prev_end; + else + max = 0; + } if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } - highest_address = vma->vm_end; + highest_address = vm_end_gap(vma); vma = vma->vm_next; i++; } @@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = vma->vm_end; + mm->highest_vm_end = vm_end_gap(vma); /* * vma->vm_prev wasn't known when we followed the rbtree to find the @@ -856,7 +870,7 @@ again: vma_gap_update(vma); if (end_changed) { if (!next) - mm->highest_vm_end = end; + mm->highest_vm_end = vm_end_gap(vma); else if (!adjust_next) vma_gap_update(next); } @@ -941,7 +955,7 @@ again: * mm->highest_vm_end doesn't need any update * in remove_next == 1 case. */ - VM_WARN_ON(mm->highest_vm_end != end); + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } } if (insert && file) @@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) while (true) { /* Visit left subtree if it looks promising */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end >= low_limit && vma->vm_rb.rb_left) { struct vm_area_struct *left = rb_entry(vma->vm_rb.rb_left, @@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) } } - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; check_current: /* Check if current node has a suitable gap */ if (gap_start > high_limit) return -ENOMEM; - if (gap_end >= low_limit && gap_end - gap_start >= length) + if (gap_end >= low_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit right subtree if it looks promising */ @@ -1825,8 +1840,8 @@ check_current: vma = rb_entry(rb_parent(prev), struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_left) { - gap_start = vma->vm_prev->vm_end; - gap_end = vma->vm_start; + gap_start = vm_end_gap(vma->vm_prev); + gap_end = vm_start_gap(vma); goto check_current; } } @@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) while (true) { /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; if (gap_start <= high_limit && vma->vm_rb.rb_right) { struct vm_area_struct *right = rb_entry(vma->vm_rb.rb_right, @@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) check_current: /* Check if current node has a suitable gap */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end < low_limit) return -ENOMEM; - if (gap_start <= high_limit && gap_end - gap_start >= length) + if (gap_start <= high_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit left subtree if it looks promising */ @@ -1929,7 +1945,7 @@ check_current: struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_right) { gap_start = vma->vm_prev ? - vma->vm_prev->vm_end : 0; + vm_end_gap(vma->vm_prev) : 0; goto check_current; } } @@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; if (len > TASK_SIZE - mmap_min_addr) @@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, const unsigned long flags) { - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info; @@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, * update accounting. This is shared with both the * grow-up and grow-down cases. */ -static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; - unsigned long new_start, actual_size; + unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ - actual_size = size; - if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) - actual_size -= PAGE_SIZE; - if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* Guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= TASK_SIZE) return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = vma->vm_next; + if (next && next->vm_start < gap_addr) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) @@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = address; + mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + unsigned long gap_addr; int error; address &= PAGE_MASK; @@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma, if (error) return error; + /* Enforce stack_guard_gap */ + gap_addr = address - stack_guard_gap; + if (gap_addr > address) + return -ENOMEM; + prev = vma->vm_prev; + if (prev && prev->vm_end > gap_addr) { + if (!(prev->vm_flags & VM_GROWSDOWN)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; @@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma, return error; } -/* - * Note how expand_stack() refuses to expand the stack all the way to - * abut the next virtual mapping, *unless* that mapping itself is also - * a stack mapping. We want to leave room for a guard page, after all - * (the guard page itself is not added here, that is done by the - * actual page faulting logic) - * - * This matches the behavior of the guard page logic (see mm/memory.c: - * check_stack_guard_page()), which only allows the guard page to be - * removed under these circumstances. - */ +/* enforced gap between the expanding stack and other mappings. */ +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +static int __init cmdline_parse_stack_guard_gap(char *p) +{ + unsigned long val; + char *endptr; + + val = simple_strtoul(p, &endptr, 10); + if (!*endptr) + stack_guard_gap = val << PAGE_SHIFT; + + return 0; +} +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *next; - - address &= PAGE_MASK; - next = vma->vm_next; - if (next && next->vm_start == address + PAGE_SIZE) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - } return expand_upwards(vma, address); } @@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) #else int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *prev; - - address &= PAGE_MASK; - prev = vma->vm_prev; - if (prev && prev->vm_end == address) { - if (!(prev->vm_flags & VM_GROWSDOWN)) - return -ENOMEM; - } return expand_downwards(vma, address); } @@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = prev; vma_gap_update(vma); } else - mm->highest_vm_end = prev ? prev->vm_end : 0; + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; /* Kill the cache */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9e450c6b6e4..2302f250d6b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -292,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -314,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - unsigned long max_initialise; - /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - (*nr_initialised)++; - if ((*nr_initialised > max_initialise) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -3870,7 +3881,9 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE)) + if (test_thread_flag(TIF_MEMDIE) && + (alloc_flags == ALLOC_NO_WATERMARKS || + (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; /* Retry as long as the OOM killer is making progress */ @@ -6136,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; @@ -6158,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..2da71e627812 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); /* * We failed to write the page out to swap-space. @@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", diff --git a/mm/rmap.c b/mm/rmap.c index d405f0e0ee96..130c238fe384 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; - int cpu; if (!tlb_ubc->flush_required) return; - cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - } - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); - cpumask_clear(&tlb_ubc->cpumask); + arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; - put_cpu(); } /* Flush iff there are potentially writable TLB entries that can race with IO */ @@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; - cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; /* diff --git a/mm/shmem.c b/mm/shmem.c index e67d6ba4e98e..9100c4952698 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -75,6 +75,7 @@ static struct vfsmount *shm_mnt; #include <uapi/linux/memfd.h> #include <linux/userfaultfd_k.h> #include <linux/rmap.h> +#include <linux/uuid.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -1902,10 +1903,10 @@ unlock: * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); return ret; } @@ -2840,7 +2841,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); - WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; @@ -3761,6 +3762,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif + uuid_gen(&sb->s_uuid); inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) diff --git a/mm/slub.c b/mm/slub.c index 57e5156f02be..8addc535bcdc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5512,6 +5512,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) char mbuf[64]; char *buf; struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + ssize_t len; if (!attr || !attr->store || !attr->show) continue; @@ -5536,8 +5537,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(root_cache, buf); - attr->store(s, buf, strlen(buf)); + len = attr->show(root_cache, buf); + if (len > 0) + attr->store(s, buf, len); } if (buffer) @@ -5623,6 +5625,28 @@ static char *create_unique_id(struct kmem_cache *s) return name; } +static void sysfs_slab_remove_workfn(struct work_struct *work) +{ + struct kmem_cache *s = + container_of(work, struct kmem_cache, kobj_remove_work); + + if (!s->kobj.state_in_sysfs) + /* + * For a memcg cache, this may be called during + * deactivation and again on shutdown. Remove only once. + * A cache is never shut down before deactivation is + * complete, so no need to worry about synchronization. + */ + return; + +#ifdef CONFIG_MEMCG + kset_unregister(s->memcg_kset); +#endif + kobject_uevent(&s->kobj, KOBJ_REMOVE); + kobject_del(&s->kobj); + kobject_put(&s->kobj); +} + static int sysfs_slab_add(struct kmem_cache *s) { int err; @@ -5630,6 +5654,8 @@ static int sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn); + if (!kset) { kobject_init(&s->kobj, &slab_ktype); return 0; @@ -5693,20 +5719,8 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; - if (!s->kobj.state_in_sysfs) - /* - * For a memcg cache, this may be called during - * deactivation and again on shutdown. Remove only once. - * A cache is never shut down before deactivation is - * complete, so no need to worry about synchronization. - */ - return; - -#ifdef CONFIG_MEMCG - kset_unregister(s->memcg_kset); -#endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); + kobject_get(&s->kobj); + schedule_work(&s->kobj_remove_work); } void sysfs_slab_release(struct kmem_cache *s) diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index ac6318a064d3..3405b4ee1757 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type) if (!page) goto not_enough_page; ctrl->map[idx] = page; + + if (!(idx % SWAP_CLUSTER_MAX)) + cond_resched(); } return 0; not_enough_page: diff --git a/mm/util.c b/mm/util.c index 464df3489903..26be6407abd7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -357,8 +357,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); /* - * Make sure that larger requests are not too disruptive - no OOM - * killer and no allocation failure warnings as we have a fallback + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. */ if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 34a1c3e46ed7..ecc97f74ab18 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (p4d_none(*p4d)) return NULL; pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + + /* + * Don't dereference bad PUD or PMD (below) entries. This will also + * identify huge mappings, which we may encounter on architectures + * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be + * identified as vmalloc addresses by is_vmalloc_addr(), but are + * not [unambiguously] associated with a struct page, so there is + * no correct value to return for them. + */ + WARN_ON_ONCE(pud_bad(*pud)); + if (pud_none(*pud) || pud_bad(*pud)) return NULL; pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + WARN_ON_ONCE(pmd_bad(*pmd)); + if (pmd_none(*pmd) || pmd_bad(*pmd)) return NULL; ptep = pte_offset_map(pmd, addr); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 6063581f705c..ce0618bfa8d0 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long pressure = 0; /* - * reclaimed can be greater than scanned in cases - * like THP, where the scanned is 1 and reclaimed - * could be 512 + * reclaimed can be greater than scanned for things such as reclaimed + * slab pages. shrink_node() just adds reclaimed pages without a + * related increment to scanned pages. */ if (reclaimed >= scanned) goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ad39bbc79e6..c3c1c6ac62da 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3652,7 +3652,7 @@ int kswapd_run(int nid) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ - BUG_ON(system_state == SYSTEM_BOOTING); + BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; |