diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 160 | ||||
-rw-r--r-- | mm/huge_memory.c | 42 | ||||
-rw-r--r-- | mm/hugetlb.c | 49 | ||||
-rw-r--r-- | mm/khugepaged.c | 2 | ||||
-rw-r--r-- | mm/ksm.c | 4 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 37 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 14 | ||||
-rw-r--r-- | mm/memremap.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 32 | ||||
-rw-r--r-- | mm/mlock.c | 24 | ||||
-rw-r--r-- | mm/page_isolation.c | 8 | ||||
-rw-r--r-- | mm/percpu.c | 2 | ||||
-rw-r--r-- | mm/rmap.c | 9 | ||||
-rw-r--r-- | mm/shmem.c | 10 | ||||
-rw-r--r-- | mm/slub.c | 12 | ||||
-rw-r--r-- | mm/swap.c | 6 | ||||
-rw-r--r-- | mm/vmscan.c | 18 |
19 files changed, 322 insertions, 117 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 054d93a86f8a..8c8f2deee4e3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -988,9 +988,43 @@ void __init pagecache_init(void) page_writeback_init(); } +/* + * The page wait code treats the "wait->flags" somewhat unusually, because + * we have multiple different kinds of waits, not just the usual "exclusive" + * one. + * + * We have: + * + * (a) no special bits set: + * + * We're just waiting for the bit to be released, and when a waker + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, + * and remove it from the wait queue. + * + * Simple and straightforward. + * + * (b) WQ_FLAG_EXCLUSIVE: + * + * The waiter is waiting to get the lock, and only one waiter should + * be woken up to avoid any thundering herd behavior. We'll set the + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. + * + * This is the traditional exclusive wait. + * + * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: + * + * The waiter is waiting to get the bit, and additionally wants the + * lock to be transferred to it for fair lock behavior. If the lock + * cannot be taken, we stop walking the wait queue without waking + * the waiter. + * + * This is the "fair lock handoff" case, and in addition to setting + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see + * that it now has the lock. + */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { - int ret; + unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); @@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return 0; /* - * If it's an exclusive wait, we get the bit for it, and - * stop walking if we can't. - * - * If it's a non-exclusive wait, then the fact that this - * wake function was called means that the bit already - * was cleared, and we don't care if somebody then - * re-took it. + * If it's a lock handoff wait, we get the bit for it, and + * stop walking (and do not wake it up) if we can't. */ - ret = 0; - if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(key->bit_nr, &key->page->flags)) + flags = wait->flags; + if (flags & WQ_FLAG_EXCLUSIVE) { + if (test_bit(key->bit_nr, &key->page->flags)) return -1; - ret = 1; + if (flags & WQ_FLAG_CUSTOM) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + flags |= WQ_FLAG_DONE; + } } - wait->flags |= WQ_FLAG_WOKEN; + /* + * We are holding the wait-queue lock, but the waiter that + * is waiting for this will be checking the flags without + * any locking. + * + * So update the flags atomically, and wake up the waiter + * afterwards to avoid any races. This store-release pairs + * with the load-acquire in wait_on_page_bit_common(). + */ + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * - * Note that this has to be the absolute last thing we do, - * since after list_del_init(&wait->entry) the wait entry + * Note that this pairs with the "finish_wait()" in the + * waiter, and has to be the absolute last thing we do. + * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); - return ret; + return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1107,8 +1150,8 @@ enum behavior { }; /* - * Attempt to check (or get) the page bit, and mark the - * waiter woken if successful. + * Attempt to check (or get) the page bit, and mark us done + * if successful. */ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, struct wait_queue_entry *wait) @@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, } else if (test_bit(bit_nr, &page->flags)) return false; - wait->flags |= WQ_FLAG_WOKEN; + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } +/* How many times do we accept lock stealing from under a waiter? */ +int sysctl_page_lock_unfairness = 5; + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { + int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; @@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } init_wait(wait); - wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; +repeat: + wait->flags = 0; + if (behavior == EXCLUSIVE) { + wait->flags = WQ_FLAG_EXCLUSIVE; + if (--unfairness < 0) + wait->flags |= WQ_FLAG_CUSTOM; + } + /* * Do one last check whether we can get the * page bit synchronously. @@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, /* * From now on, all the logic will be based on - * the WQ_FLAG_WOKEN flag, and the and the page - * bit testing (and setting) will be - or has - * already been - done by the wake function. + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to + * see whether the page bit testing has already + * been done by the wake function. * * We can drop our reference to the page. */ if (behavior == DROP) put_page(page); + /* + * Note that until the "finish_wait()", or until + * we see the WQ_FLAG_WOKEN flag, we need to + * be very careful with the 'wait->flags', because + * we may race with a waker that sets them. + */ for (;;) { + unsigned int flags; + set_current_state(state); - if (signal_pending_state(state, current)) + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(state, current)) + break; + + io_schedule(); + continue; + } + + /* If we were non-exclusive, we're done */ + if (behavior != EXCLUSIVE) break; - if (wait->flags & WQ_FLAG_WOKEN) + /* If the waker got the lock for us, we're done */ + if (flags & WQ_FLAG_DONE) break; - io_schedule(); + /* + * Otherwise, if we're getting the lock, we need to + * try to get it ourselves. + * + * And if that fails, we'll have to retry this all. + */ + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + goto repeat; + + wait->flags |= WQ_FLAG_DONE; + break; } + /* + * If a signal happened, this 'finish_wait()' may remove the last + * waiter from the wait-queues, but the PageWaiters bit will remain + * set. That's ok. The next wakeup will take care of it, and trying + * to do it here would be difficult and prone to races. + */ finish_wait(q, wait); if (thrashing) { @@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } /* - * A signal could leave PageWaiters set. Clearing it here if - * !waitqueue_active would be possible (by open-coding finish_wait), - * but still fail to catch it in the case of wait hash collision. We - * already can fail to clear wait hash collision cases, so don't - * bother with signals either. + * NOTE! The wait->flags weren't stable until we've done the + * 'finish_wait()', and we could have exited the loop above due + * to a signal, and had a wakeup event happen after the signal + * test but before the 'finish_wait()'. + * + * So only after the finish_wait() can we reliably determine + * if we got woken up or not, so we can now figure out the final + * return value based on that state without races. + * + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive + * waiter, but an exclusive one requires WQ_FLAG_DONE. */ + if (behavior == EXCLUSIVE) + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ff29cc3d55c..faadc449cca5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2022,7 +2022,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, put_page(page); add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; - } else if (is_huge_zero_pmd(*pmd)) { + } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_invalidate_range() see comments below inside @@ -2116,30 +2116,34 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - atomic_inc(&page[i]._mapcount); - pte_unmap(pte); - } - - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). - */ - if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) + if (!pmd_migration) atomic_inc(&page[i]._mapcount); + pte_unmap(pte); } - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. */ - __dec_lruvec_page_state(page, NR_ANON_THPS); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ + if (!pmd_migration) { + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && + !TestSetPageDoubleMap(page)) { for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); + atomic_inc(&page[i]._mapcount); + } + + lock_page_memcg(page); + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __dec_lruvec_page_state(page, NR_ANON_THPS); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } } + unlock_page_memcg(page); } - unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a301c2d672bf..67fc6383995b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1250,21 +1250,32 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { unsigned long nr_pages = 1UL << huge_page_order(h); + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); #ifdef CONFIG_CMA { struct page *page; int node; - for_each_node_mask(node, *nodemask) { - if (!hugetlb_cma[node]) - continue; - - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); + if (hugetlb_cma[nid]) { + page = cma_alloc(hugetlb_cma[nid], nr_pages, + huge_page_order(h), true); if (page) return page; } + + if (!(gfp_mask & __GFP_THISNODE)) { + for_each_node_mask(node, *nodemask) { + if (node == nid || !hugetlb_cma[node]) + continue; + + page = cma_alloc(hugetlb_cma[node], nr_pages, + huge_page_order(h), true); + if (page) + return page; + } + } } #endif @@ -3454,6 +3465,22 @@ static unsigned int allowed_mems_nr(struct hstate *h) } #ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. + */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) @@ -3465,9 +3492,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!hugepages_supported()) return -EOPNOTSUPP; - table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out; @@ -3510,9 +3536,8 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, if (write && hstate_is_gigantic(h)) return -EINVAL; - table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e749e568e1ea..cfa0dba5fd3b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1709,7 +1709,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, - PAGE_SIZE); + end - index); /* drain pagevecs to help isolate_lru_page() */ lru_add_drain(); page = find_lock_page(mapping, index); @@ -2586,6 +2586,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { + put_page(new_page); + new_page = NULL; + } if (new_page) { copy_user_highpage(new_page, page, address, vma); diff --git a/mm/madvise.c b/mm/madvise.c index dd1d43cf026d..d4aa5f776543 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -289,9 +289,9 @@ static long madvise_willneed(struct vm_area_struct *vma, */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); - mmap_read_unlock(current->mm); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + mmap_read_unlock(current->mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(current->mm); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b807952b4d43..cfa6cbad21d5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6774,6 +6774,9 @@ static void uncharge_batch(const struct uncharge_gather *ug) __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); + + /* drop reference from uncharge_page */ + css_put(&ug->memcg->css); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) @@ -6797,6 +6800,9 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) uncharge_gather_clear(ug); } ug->memcg = page->mem_cgroup; + + /* pairs with css_put in uncharge_batch */ + css_get(&ug->memcg->css); } nr_pages = compound_nr(page); diff --git a/mm/memory.c b/mm/memory.c index 148eafb8cbb1..469af373ae76 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> +#include <linux/vmalloc.h> #include <trace/events/kmem.h> @@ -83,6 +84,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> +#include "pgalloc-track.h" #include "internal.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) @@ -2206,7 +2208,8 @@ EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { pte_t *pte; int err = 0; @@ -2214,7 +2217,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (create) { pte = (mm == &init_mm) ? - pte_alloc_kernel(pmd, addr) : + pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; @@ -2235,6 +2238,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, break; } } while (addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); @@ -2245,7 +2249,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; @@ -2254,7 +2259,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, BUG_ON(pud_huge(*pud)); if (create) { - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { @@ -2264,7 +2269,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (create || !pmd_none_or_clear_bad(pmd)) { err = apply_to_pte_range(mm, pmd, addr, next, fn, data, - create); + create, mask); if (err) break; } @@ -2274,14 +2279,15 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { - pud = pud_alloc(mm, p4d, addr); + pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { @@ -2291,7 +2297,7 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, next = pud_addr_end(addr, end); if (create || !pud_none_or_clear_bad(pud)) { err = apply_to_pmd_range(mm, pud, addr, next, fn, data, - create); + create, mask); if (err) break; } @@ -2301,14 +2307,15 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { - p4d = p4d_alloc(mm, pgd, addr); + p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { @@ -2318,7 +2325,7 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, next = p4d_addr_end(addr, end); if (create || !p4d_none_or_clear_bad(p4d)) { err = apply_to_pud_range(mm, p4d, addr, next, fn, data, - create); + create, mask); if (err) break; } @@ -2331,8 +2338,9 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, void *data, bool create) { pgd_t *pgd; - unsigned long next; + unsigned long start = addr, next; unsigned long end = addr + size; + pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) @@ -2343,11 +2351,14 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (!create && pgd_none_or_clear_bad(pgd)) continue; - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create); + err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, start + size); + return err; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e9d5ab5d3ca0..b11a269e2356 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1575,6 +1575,20 @@ static int __ref __offline_pages(unsigned long start_pfn, /* check again */ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, check_pages_isolated_cb); + /* + * per-cpu pages are drained in start_isolate_page_range, but if + * there are still pages that are not free, make sure that we + * drain again, because when we isolated range we might + * have raced with another thread that was adding pages to pcp + * list. + * + * Forward progress should be still guaranteed because + * pages on the pcp list can only belong to MOVABLE_ZONE + * because has_unmovable_pages explicitly checks for + * PageBuddy on freed pages on other zones. + */ + if (ret) + drain_all_pages(zone); } while (ret); /* Ok, all of our target is isolated. diff --git a/mm/memremap.c b/mm/memremap.c index 03e38b7a38f1..006dace60b1a 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -216,7 +216,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; - case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_GENERIC: need_devmap_managed = false; break; case MEMORY_DEVICE_PCI_P2PDMA: diff --git a/mm/migrate.c b/mm/migrate.c index 34a842a8eb6a..aecb1433cf3c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -246,13 +246,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); - if (unlikely(is_zone_device_page(new))) { - if (is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*pvmw.pte)) - pte = pte_mkuffd_wp(pte); - } + if (unlikely(is_device_private_page(new))) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_swp_mkuffd_wp(pte); } #ifdef CONFIG_HUGETLB_PAGE @@ -668,7 +668,8 @@ void migrate_page_states(struct page *newpage, struct page *page) copy_page_owner(page, newpage); - mem_cgroup_migrate(page, newpage); + if (!PageHuge(page)) + mem_cgroup_migrate(page, newpage); } EXPORT_SYMBOL(migrate_page_states); @@ -2427,10 +2428,17 @@ again: entry = make_migration_entry(page, mpfn & MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } set_pte_at(mm, addr, ptep, swp_pte); /* diff --git a/mm/mlock.c b/mm/mlock.c index 93ca2bf30b4f..884b1216da6a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -58,11 +58,14 @@ EXPORT_SYMBOL(can_do_mlock); */ void clear_page_mlock(struct page *page) { + int nr_pages; + if (!TestClearPageMlocked(page)) return; - mod_zone_page_state(page_zone(page), NR_MLOCK, -thp_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGCLEARED); + nr_pages = thp_nr_pages(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); /* * The previous TestClearPageMlocked() corresponds to the smp_mb() * in __pagevec_lru_add_fn(). @@ -76,7 +79,7 @@ void clear_page_mlock(struct page *page) * We lost the race. the page already moved to evictable list. */ if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); + count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } } @@ -93,9 +96,10 @@ void mlock_vma_page(struct page *page) VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, - thp_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); if (!isolate_lru_page(page)) putback_lru_page(page); } @@ -138,7 +142,7 @@ static void __munlock_isolated_page(struct page *page) /* Did try_to_unlock() succeed or punt? */ if (!PageMlocked(page)) - count_vm_event(UNEVICTABLE_PGMUNLOCKED); + count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); putback_lru_page(page); } @@ -154,10 +158,12 @@ static void __munlock_isolated_page(struct page *page) */ static void __munlock_isolation_failed(struct page *page) { + int nr_pages = thp_nr_pages(page); + if (PageUnevictable(page)) - __count_vm_event(UNEVICTABLE_PGSTRANDED); + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); else - __count_vm_event(UNEVICTABLE_PGMUNLOCKED); + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); } /** diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 242c03121d73..63a3db10a8c0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -170,6 +170,14 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * pageblocks we may have modified and return -EBUSY to caller. This * prevents two threads from simultaneously working on overlapping ranges. * + * Please note that there is no strong synchronization with the page allocator + * either. Pages might be freed while their page blocks are marked ISOLATED. + * In some cases pages might still end up on pcp lists and that would allow + * for their allocation even when they are in fact isolated already. Depending + * on how strong of a guarantee the caller needs drain_all_pages might be needed + * (e.g. __offline_pages will need to call it after check for isolated range for + * a next retry). + * * Return: the number of isolated pageblocks on success and -EBUSY if any part * of range cannot be isolated. */ diff --git a/mm/percpu.c b/mm/percpu.c index f4709629e6de..1ed1a349eab8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1316,7 +1316,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT); + BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long); chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk) panic("%s: Failed to allocate %zu bytes\n", __func__, diff --git a/mm/rmap.c b/mm/rmap.c index 83cc459edc40..9425260774a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1511,9 +1511,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ entry = make_migration_entry(page, 0); swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) + + /* + * pteval maps a zone device page and is therefore + * a swap pte. + */ + if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) + if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); /* diff --git a/mm/shmem.c b/mm/shmem.c index 271548ca20f3..8e2b35ba93ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -279,11 +279,13 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) if (!(sb->s_flags & SB_KERNMOUNT)) { spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; } - sbinfo->free_inodes--; if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) diff --git a/mm/slub.c b/mm/slub.c index 68c02b2eecd9..d4177aecedf6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -672,12 +672,12 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) } static bool freelist_corrupted(struct kmem_cache *s, struct page *page, - void *freelist, void *nextfree) + void **freelist, void *nextfree) { if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, page, nextfree)) { - object_err(s, page, freelist, "Freechain corrupt"); - freelist = NULL; + !check_valid_pointer(s, page, nextfree) && freelist) { + object_err(s, page, *freelist, "Freechain corrupt"); + *freelist = NULL; slab_fix(s, "Isolate corrupted freechain"); return true; } @@ -1494,7 +1494,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} static bool freelist_corrupted(struct kmem_cache *s, struct page *page, - void *freelist, void *nextfree) + void **freelist, void *nextfree) { return false; } @@ -2184,7 +2184,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * 'freelist' is already corrupted. So isolate all objects * starting at 'freelist'. */ - if (freelist_corrupted(s, page, freelist, nextfree)) + if (freelist_corrupted(s, page, &freelist, nextfree)) break; do { diff --git a/mm/swap.c b/mm/swap.c index d16d65d9b4e0..e7bdf094f76a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -494,14 +494,14 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; if (unlikely(unevictable) && !TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); /* * We use the irq-unsafe __mod_zone_page_stat because this * counter is not modified from interrupt context, and the pte * lock is held(spinlock), which implies preemption disabled. */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - thp_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); + __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } lru_cache_add(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 99e1796eb833..466fc3144fff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2615,6 +2615,14 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(memcg)) { @@ -4260,8 +4268,14 @@ void check_move_unevictable_pages(struct pagevec *pvec) for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; struct pglist_data *pagepgdat = page_pgdat(page); + int nr_pages; + + if (PageTransTail(page)) + continue; + + nr_pages = thp_nr_pages(page); + pgscanned += nr_pages; - pgscanned++; if (pagepgdat != pgdat) { if (pgdat) spin_unlock_irq(&pgdat->lru_lock); @@ -4280,7 +4294,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) ClearPageUnevictable(page); del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec, lru); - pgrescued++; + pgrescued += nr_pages; } } |