diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 31 | ||||
-rw-r--r-- | mm/gup.c | 4 | ||||
-rw-r--r-- | mm/highmem.c | 21 | ||||
-rw-r--r-- | mm/huge_memory.c | 10 | ||||
-rw-r--r-- | mm/hugetlb.c | 164 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 10 | ||||
-rw-r--r-- | mm/internal.h | 25 | ||||
-rw-r--r-- | mm/kasan/common.c | 2 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 2 | ||||
-rw-r--r-- | mm/kasan/report_generic.c | 2 | ||||
-rw-r--r-- | mm/kfence/core.c | 9 | ||||
-rw-r--r-- | mm/kfence/report.c | 30 | ||||
-rw-r--r-- | mm/kmemleak.c | 3 | ||||
-rw-r--r-- | mm/madvise.c | 13 | ||||
-rw-r--r-- | mm/mapping_dirty_helpers.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 15 | ||||
-rw-r--r-- | mm/memory.c | 18 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mmu_gather.c | 29 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 23 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 16 | ||||
-rw-r--r-- | mm/page_alloc.c | 167 | ||||
-rw-r--r-- | mm/page_poison.c | 4 | ||||
-rw-r--r-- | mm/percpu-internal.h | 2 | ||||
-rw-r--r-- | mm/percpu-stats.c | 9 | ||||
-rw-r--r-- | mm/percpu.c | 14 | ||||
-rw-r--r-- | mm/ptdump.c | 2 | ||||
-rw-r--r-- | mm/shuffle.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 2 | ||||
-rw-r--r-- | mm/z3fold.c | 16 |
32 files changed, 438 insertions, 217 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 43700480d897..6ce832dc59e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1969,8 +1969,14 @@ unlock: put: put_page(page); next: - if (!xa_is_value(page) && PageTransHuge(page)) - xas_set(&xas, page->index + thp_nr_pages(page)); + if (!xa_is_value(page) && PageTransHuge(page)) { + unsigned int nr_pages = thp_nr_pages(page); + + /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */ + xas_set(&xas, page->index + nr_pages); + if (xas.xa_index < nr_pages) + break; + } } rcu_read_unlock(); @@ -2672,7 +2678,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); - pgoff_t max = (end - 1) / PAGE_SIZE; + pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct page *page; @@ -2681,7 +2687,8 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, rcu_read_lock(); while ((page = find_get_entry(&xas, max, XA_PRESENT))) { - loff_t pos = xas.xa_index * PAGE_SIZE; + loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; + unsigned int seek_size; if (start < pos) { if (!seek_data) @@ -2689,25 +2696,25 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, start = pos; } - pos += seek_page_size(&xas, page); + seek_size = seek_page_size(&xas, page); + pos = round_up(pos + 1, seek_size); start = page_seek_hole_data(&xas, mapping, page, start, pos, seek_data); if (start < pos) goto unlock; + if (start >= end) + break; + if (seek_size > PAGE_SIZE) + xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(page)) put_page(page); } - rcu_read_unlock(); - if (seek_data) - return -ENXIO; - goto out; - + start = -ENXIO; unlock: rcu_read_unlock(); - if (!xa_is_value(page)) + if (page && !xa_is_value(page)) put_page(page); -out: if (start > end) return end; return start; @@ -1535,6 +1535,10 @@ struct page *get_dump_page(unsigned long addr) FOLL_FORCE | FOLL_DUMP | FOLL_GET); if 
(locked) mmap_read_unlock(mm); + + if (ret == 1 && is_page_poisoned(page)) + return NULL; + return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ diff --git a/mm/highmem.c b/mm/highmem.c index 874b732b120c..6ef8f5e05e7e 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + if (start1 >= end1) + start1 = end1 = 0; + if (start2 >= end2) + start2 = end2 = 0; + for (i = 0; i < compound_nr(page); i++) { void *kaddr = NULL; - if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) - kaddr = kmap_atomic(page + i); - if (start1 >= PAGE_SIZE) { start1 -= PAGE_SIZE; end1 -= PAGE_SIZE; } else { unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); - if (end1 > start1) + if (end1 > start1) { + kaddr = kmap_atomic(page + i); memset(kaddr + start1, 0, this_end - start1); + } end1 -= this_end; start1 = 0; } @@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, } else { unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); - if (end2 > start2) + if (end2 > start2) { + if (!kaddr) + kaddr = kmap_atomic(page + i); memset(kaddr + start2, 0, this_end - start2); + } end2 -= this_end; start2 = 0; } @@ -611,7 +618,7 @@ void __kmap_local_sched_out(void) int idx; /* With debug all even slots are unmapped and act as guard */ - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(!pte_none(pteval)); continue; } @@ -647,7 +654,7 @@ void __kmap_local_sched_in(void) int idx; /* With debug all even slots are unmapped and act as guard */ - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { WARN_ON_ONCE(!pte_none(pteval)); continue; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 395c75111d33..ae907a9c2050 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ 
-1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * best effort that the pinned pages won't be replaced by another * random page during the coming copy-on-write. */ - if (unlikely(is_cow_mapping(vma->vm_flags) && - atomic_read(&src_mm->has_pinned) && - page_maybe_dma_pinned(src_page))) { + if (unlikely(page_needs_cow_for_dma(vma, src_page))) { pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, } /* Please refer to comments in copy_huge_pmd() */ - if (unlikely(is_cow_mapping(vma->vm_flags) && - atomic_read(&src_mm->has_pinned) && - page_maybe_dma_pinned(pud_page(pud)))) { + if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) { spin_unlock(src_ptl); spin_unlock(dst_ptl); __split_huge_pud(vma, src_pud, addr); @@ -2471,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, int i; /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(head); + split_page_memcg(head, nr); if (PageAnon(head) && PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8fb42c6dd74b..a86a58ef132d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -280,6 +280,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; + /* + * The caller will hold exactly one h_cg->css reference for the + * whole contiguous reservation region. But this area might be + * scattered when there are already some file_regions reside in + * it. As a result, many file_regions may share only one css + * reference. In order to ensure that one file_region must hold + * exactly one h_cg->css reference, we should do css_get for + * each file_region and leave the reference held by caller + * untouched. 
+ */ + css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in @@ -293,6 +304,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, #endif } +static void put_uncharge_info(struct file_region *rg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (rg->css) + css_put(rg->css); +#endif +} + static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { @@ -316,6 +335,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) prg->to = rg->to; list_del(&rg->link); + put_uncharge_info(rg); kfree(rg); rg = prg; @@ -327,10 +347,29 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) nrg->from = rg->from; list_del(&rg->link); + put_uncharge_info(rg); kfree(rg); } } +static inline long +hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, + long to, struct hstate *h, struct hugetlb_cgroup *cg, + long *regions_needed) +{ + struct file_region *nrg; + + if (!regions_needed) { + nrg = get_file_region_entry_from_cache(map, from, to); + record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); + list_add(&nrg->link, rg->link.prev); + coalesce_file_region(map, nrg); + } else + *regions_needed += 1; + + return to - from; +} + /* * Must be called with resv->lock held. * @@ -346,7 +385,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; - struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + struct file_region *rg = NULL, *trg = NULL; if (regions_needed) *regions_needed = 0; @@ -369,24 +408,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, /* When we find a region that starts beyond our range, we've * finished. 
*/ - if (rg->from > t) + if (rg->from >= t) break; /* Add an entry for last_accounted_offset -> rg->from, and * update last_accounted_offset. */ - if (rg->from > last_accounted_offset) { - add += rg->from - last_accounted_offset; - if (!regions_needed) { - nrg = get_file_region_entry_from_cache( - resv, last_accounted_offset, rg->from); - record_hugetlb_cgroup_uncharge_info(h_cg, h, - resv, nrg); - list_add(&nrg->link, rg->link.prev); - coalesce_file_region(resv, nrg); - } else - *regions_needed += 1; - } + if (rg->from > last_accounted_offset) + add += hugetlb_resv_map_add(resv, rg, + last_accounted_offset, + rg->from, h, h_cg, + regions_needed); last_accounted_offset = rg->to; } @@ -394,17 +426,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t, /* Handle the case where our range extends beyond * last_accounted_offset. */ - if (last_accounted_offset < t) { - add += t - last_accounted_offset; - if (!regions_needed) { - nrg = get_file_region_entry_from_cache( - resv, last_accounted_offset, t); - record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); - list_add(&nrg->link, rg->link.prev); - coalesce_file_region(resv, nrg); - } else - *regions_needed += 1; - } + if (last_accounted_offset < t) + add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, + t, h, h_cg, regions_needed); VM_BUG_ON(add < 0); return add; @@ -659,7 +683,7 @@ retry: del += t - f; hugetlb_cgroup_uncharge_file_region( - resv, rg, t - f); + resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; @@ -680,7 +704,7 @@ retry: if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, - rg->to - rg->from); + rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; @@ -688,13 +712,13 @@ retry: if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, - t - rg->from); + t - rg->from, false); del += t - rg->from; rg->from 
= t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, - rg->to - f); + rg->to - f, false); del += rg->to - f; rg->to = f; @@ -3725,21 +3749,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) return false; } +static void +hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct page *new_page) +{ + __SetPageUptodate(new_page); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + hugepage_add_new_anon_rmap(new_page, vma, addr); + hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); + ClearHPageRestoreReserve(new_page); + SetHPageMigratable(new_page); +} + int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; - int cow; + bool cow = is_cow_mapping(vma->vm_flags); struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long npages = pages_per_huge_page(h); struct address_space *mapping = vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, vma->vm_start, @@ -3784,6 +3819,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); dst_entry = huge_ptep_get(dst_pte); +again: if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { /* * Skip if src entry none. Also, skip in the @@ -3807,6 +3843,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { + entry = huge_ptep_get(src_pte); + ptepage = pte_page(entry); + get_page(ptepage); + + /* + * This is a rare case where we see pinned hugetlb + * pages while they're prone to COW. 
We need to do the + * COW earlier during fork. + * + * When pre-allocating the page or copying data, we + * need to be without the pgtable locks since we could + * sleep during the process. + */ + if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { + pte_t src_pte_old = entry; + struct page *new; + + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + /* Do not use reserve as it's private owned */ + new = alloc_huge_page(vma, addr, 1); + if (IS_ERR(new)) { + put_page(ptepage); + ret = PTR_ERR(new); + break; + } + copy_user_huge_page(new, ptepage, addr, vma, + npages); + put_page(ptepage); + + /* Install the new huge page if src pte stable */ + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + entry = huge_ptep_get(src_pte); + if (!pte_same(src_pte_old, entry)) { + put_page(new); + /* dst_entry won't change as in child */ + goto again; + } + hugetlb_install_page(vma, dst_pte, addr, new); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + continue; + } + if (cow) { /* * No need to notify as we are downgrading page @@ -3817,12 +3899,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ huge_ptep_set_wrprotect(src, addr, src_pte); } - entry = huge_ptep_get(src_pte); - ptepage = pte_page(entry); - get_page(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); - hugetlb_count_add(pages_per_huge_page(h), dst); + hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -5128,6 +5208,10 @@ bool hugetlb_reserve_pages(struct inode *inode, */ long rsv_adjust; + /* + * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the + * reference to h_cg->css. See comment below for detail. 
+ */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); @@ -5135,6 +5219,14 @@ bool hugetlb_reserve_pages(struct inode *inode, rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); + } else if (h_cg) { + /* + * The file_regions will hold their own reference to + * h_cg->css. So we should release the reference held + * via hugetlb_cgroup_charge_cgroup_rsvd() when we are + * done. + */ + hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return true; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f68b51fcda3d..603a131e262d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -391,7 +391,8 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, - unsigned long nr_pages) + unsigned long nr_pages, + bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; @@ -400,7 +401,12 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); - css_put(rg->css); + /* + * Only do css_put(rg->css) when we delete the entire region + * because one file_region must hold exactly one css reference. 
+ */ + if (region_del) + css_put(rg->css); } } diff --git a/mm/internal.h b/mm/internal.h index 9902648f2206..cb3c5e0a7799 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -97,6 +97,26 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +/* + * When the kernel touches a user page, the page may have been marked + * poisoned while still being mapped in user space. If the kernel can + * guarantee data integrity and a successful operation without this page, + * it is better to check the poison status and avoid touching the page + * rather than panic; a coredump for a fatal process signal is one example + * of this scenario. If the kernel cannot guarantee data integrity, it is + * better not to call this function and instead let the kernel touch the + * poisoned page and panic. + */ +static inline bool is_page_poisoned(struct page *page) +{ + if (PageHWPoison(page)) + return true; + else if (PageHuge(page) && PageHWPoison(compound_head(page))) + return true; + + return false; +} + extern unsigned long highest_memmap_pfn; /* @@ -296,11 +316,6 @@ static inline unsigned int buddy_order(struct page *page) */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * These three helpers classifies VMAs for virtual memory accounting. */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b5e08d4cefec..7b53291dafa1 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -63,7 +63,7 @@ void __kasan_unpoison_range(const void *address, size_t size) kasan_unpoison(address, size); } -#if CONFIG_KASAN_STACK +#ifdef CONFIG_KASAN_STACK /* Unpoison the entire stack for a task. 
*/ void kasan_unpoison_task_stack(struct task_struct *task) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8c55634d6edd..3436c6bf7c0c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -231,7 +231,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size); const char *kasan_get_bug_type(struct kasan_access_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); -#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else static inline void kasan_print_address_stack_frame(const void *addr) { } diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 41f374585144..de732bc341c5 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -128,7 +128,7 @@ void kasan_metadata_fetch_row(char *buffer, void *row) memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } -#if CONFIG_KASAN_STACK +#ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, unsigned long *value) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 3b8ec938470a..d53c91f881a4 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -12,6 +12,7 @@ #include <linux/debugfs.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> +#include <linux/kmemleak.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/memblock.h> @@ -480,6 +481,14 @@ static bool __init kfence_init_pool(void) addr += 2 * PAGE_SIZE; } + /* + * The pool is live and will never be deallocated from this point on. + * Remove the pool object from the kmemleak object tree, as it would + * otherwise overlap with allocations returned by kfence_alloc(), which + * are registered with kmemleak through the slab post-alloc hook. 
+ */ + kmemleak_free(__kfence_pool); + return true; err: diff --git a/mm/kfence/report.c b/mm/kfence/report.c index ab83d5a59bb1..e3f71451ad9e 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -20,6 +20,11 @@ #include "kfence.h" +/* May be overridden by <asm/kfence.h>. */ +#ifndef ARCH_FUNC_PREFIX +#define ARCH_FUNC_PREFIX "" +#endif + extern bool no_hash_pointers; /* Helper function to either print to a seq_file or to console. */ @@ -67,8 +72,9 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries for (skipnr = 0; skipnr < num_entries; skipnr++) { int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); - if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || - !strncmp(buf, "__slab_free", len)) { + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || + !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { /* * In case of tail calls from any of the below * to any of the above. @@ -77,10 +83,10 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries } /* Also the *_bulk() variants by only checking prefixes. 
*/ - if (str_has_prefix(buf, "kfree") || - str_has_prefix(buf, "kmem_cache_free") || - str_has_prefix(buf, "__kmalloc") || - str_has_prefix(buf, "kmem_cache_alloc")) + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; } if (fallback < num_entries) @@ -116,12 +122,12 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met lockdep_assert_held(&meta->lock); if (meta->state == KFENCE_OBJECT_UNUSED) { - seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); + seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata); return; } seq_con_printf(seq, - "kfence-#%zd [0x%p-0x%p" + "kfence-#%td [0x%p-0x%p" ", size=%d, cache=%s] allocated by task %d:\n", meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid); @@ -204,7 +210,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n", get_access_type(is_write), (void *)address, left_of_object ? meta->addr - address : address - meta->addr, left_of_object ? 
"left" : "right", object_index); @@ -213,14 +219,14 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r case KFENCE_ERROR_UAF: pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), (void *)stack_entries[skipnr]); - pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", + pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n", get_access_type(is_write), (void *)address, object_index); break; case KFENCE_ERROR_CORRUPTION: pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); pr_err("Corrupted memory at 0x%p ", (void *)address); print_diff_canary(address, 16, meta); - pr_cont(" (in kfence-#%zd):\n", object_index); + pr_cont(" (in kfence-#%td):\n", object_index); break; case KFENCE_ERROR_INVALID: pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), @@ -230,7 +236,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r break; case KFENCE_ERROR_INVALID_FREE: pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); - pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, + pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address, object_index); break; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c0014d3b91c1..fe6e3ae8e8c6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -97,6 +97,7 @@ #include <linux/atomic.h> #include <linux/kasan.h> +#include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/memory_hotplug.h> @@ -589,7 +590,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, atomic_set(&object->use_count, 1); object->flags = OBJECT_ALLOCATED; object->pointer = ptr; - object->size = size; + object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ diff --git a/mm/madvise.c b/mm/madvise.c index df692d2e35d4..01fef79ac761 100644 --- a/mm/madvise.c +++ 
b/mm/madvise.c @@ -1198,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } - mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) { ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; goto release_task; } + /* + * Require CAP_SYS_NICE for influencing process performance. Note that + * only non-destructive hints are currently supported. + */ + if (!capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; + } + total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { @@ -1218,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, if (ret == 0) ret = total_len - iov_iter_count(&iter); +release_mm: mmput(mm); release_task: put_task_struct(task); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index b59054ef2e10..b890854ec761 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -165,10 +165,12 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, return 0; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD /* Huge pud */ walk->action = ACTION_CONTINUE; if (pud_trans_huge(pudval) || pud_devmap(pudval)) WARN_ON(pud_write(pudval) || pud_dirty(pudval)); +#endif return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 845eec01ef9d..e064ac0d850a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * Because page_memcg(head) is not set on compound tails, set it now. + * Because page_memcg(head) is not set on tails, set it now. 
*/ -void mem_cgroup_split_huge_fixup(struct page *head) +void split_page_memcg(struct page *head, unsigned int nr) { struct mem_cgroup *memcg = page_memcg(head); int i; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < HPAGE_PMD_NR; i++) { - css_get(&memcg->css); - head[i].memcg_data = (unsigned long)memcg; - } + for (i = 1; i < nr; i++) + head[i].memcg_data = head->memcg_data; + css_get_many(&memcg->css, nr - 1); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP /** diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..550405fc3b5e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -166,7 +166,7 @@ static int __init init_zero_pfn(void) zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } -core_initcall(init_zero_pfn); +early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { @@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct page **prealloc, pte_t pte, struct page *page) { - struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; - if (!is_cow_mapping(src_vma->vm_flags)) - return 1; - /* * What we want to do is to check whether this page may * have been pinned by the parent process. If so, @@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * the page count. That might give false positives for * for pinning, but it will work correctly. */ - if (likely(!atomic_read(&src_mm->has_pinned))) - return 1; - if (likely(!page_maybe_dma_pinned(page))) + if (likely(!page_needs_cow_for_dma(src_vma, page))) return 1; new_page = *prealloc; @@ -3103,6 +3097,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_WP); } + /* + * Userfaultfd write-protect can defer flushes. Ensure the TLB + * is flushed in this case before copying. 
+ */ + if (unlikely(userfaultfd_wp(vmf->vma) && + mm_tlb_flush_pending(vmf->vma->vm_mm))) + flush_tlb_page(vmf->vma, vmf->address); + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5ba51a8bdaeb..0cdbbfbc5757 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1072,7 +1072,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { - struct mhp_params params = { .pgprot = PAGE_KERNEL }; + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; u64 start, size; bool new_node = false; int ret; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 0dc7149b0c61..1b9837419bf9 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -249,16 +249,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) tlb_flush_mmu_free(tlb); } -/** - * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down - * @tlb: the mmu_gather structure to initialize - * @mm: the mm_struct of the target address space - * @fullmm: @mm is without users and we're going to destroy the full address - * space (exit/execve) - * - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. - */ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { @@ -283,11 +273,30 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, inc_tlb_flush_pending(tlb->mm); } +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. 
+ */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, false); } +/** + * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * In this case, @mm is without users and we're going to destroy the + * full address space (exit/execve). + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + */ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) { __tlb_gather_mmu(tlb, mm, true); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 61ee40ed804e..459d195d2ff6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start( ""); WARN_ON(mmu_notifier_range_blockable(range) || _ret != -EAGAIN); + /* + * We call all the notifiers on any EAGAIN, + * there is no way for a notifier to know if + * its start method failed, thus a start that + * does EAGAIN can't also do end. + */ + WARN_ON(ops->invalidate_range_end); ret = _ret; } } } + + if (ret) { + /* + * Must be non-blocking to get here. If there are multiple + * notifiers and one or more failed start, any that succeeded + * start are expecting their end to be called. Do so now. + */ + hlist_for_each_entry_rcu(subscription, &subscriptions->list, + hlist, srcu_read_lock_held(&srcu)) { + if (!subscription->ops->invalidate_range_end) + continue; + + subscription->ops->invalidate_range_end(subscription, + range); + } + } srcu_read_unlock(&srcu, id); return ret; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9efaf430cfd3..fa1cf18bac97 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -170,7 +170,7 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/** +/* * Check whether unreclaimable slab amount is greater than * all user memory(LRU pages). 
* dump_unreclaimable_slab() could help in the case that diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eb34d204d4ee..9e35b636a393 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2833,6 +2833,22 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); +/* + * Wait for a page to complete writeback. Returns -EINTR if we get a + * fatal signal while waiting. + */ +int wait_on_page_writeback_killable(struct page *page) +{ + while (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + if (wait_on_page_bit_killable(page, PG_writeback)) + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e4b29ee2b1e..cfc72873961d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1282,6 +1282,12 @@ static __always_inline bool free_pages_prepare(struct page *page, kernel_poison_pages(page, 1 << order); /* + * With hardware tag-based KASAN, memory tags must be set before the + * page becomes unavailable via debug_pagealloc or arch_free_page. + */ + kasan_free_nondeferred_pages(page, order); + + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should * happen after this. 
@@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_pagealloc_unmap_pages(page, 1 << order); - kasan_free_nondeferred_pages(page, order); - return true; } @@ -3310,6 +3314,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -6259,12 +6264,65 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +/* + * Only struct pages that correspond to ranges defined by memblock.memory + * are zeroed and initialized by going through __init_single_page() during + * memmap_init_zone(). + * + * But, there could be struct pages that correspond to holes in + * memblock.memory. This can happen because of the following reasons: + * - physical memory bank size is not necessarily the exact multiple of the + * arbitrary section size + * - early reserved memory may not be listed in memblock.memory + * - memory layouts defined with memmap= kernel parameter may not align + * nicely with memmap sections + * + * Explicitly initialize those struct pages so that: + * - PG_Reserved is set + * - zone and node links point to zone and node that span the page if the + * hole is in the middle of a zone + * - zone and node links point to adjacent zone/node if the hole falls on + * the zone boundary; the pages in such holes will be prepended to the + * zone/node above the hole except for the trailing pages in the last + * section that will be appended to the zone/node below. 
+ */ +static u64 __meminit init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + + pageblock_nr_pages - 1; + continue; + } + __init_single_page(pfn_to_page(pfn), pfn, zone, node); + __SetPageReserved(pfn_to_page(pfn)); + pgcnt++; + } + + return pgcnt; +} +#else +static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, + int zone, int node) +{ + return 0; +} +#endif + void __meminit __weak memmap_init_zone(struct zone *zone) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); + static unsigned long hole_pfn; unsigned long start_pfn, end_pfn; + u64 pgcnt = 0; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); @@ -6274,7 +6332,29 @@ void __meminit __weak memmap_init_zone(struct zone *zone) memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (hole_pfn < start_pfn) + pgcnt += init_unavailable_range(hole_pfn, start_pfn, + zone_id, nid); + hole_pfn = end_pfn; } + +#ifdef CONFIG_SPARSEMEM + /* + * Initialize the hole in the range [zone_end_pfn, section_end]. + * If zone boundary falls in the middle of a section, this hole + * will be re-initialized during the call to this function for the + * higher zone. 
+ */ + end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); + if (hole_pfn < end_pfn) + pgcnt += init_unavailable_range(hole_pfn, end_pfn, + zone_id, nid); +#endif + + if (pgcnt) + pr_info(" %s zone: %llu pages in unavailable ranges\n", + zone->name, pgcnt); } static int zone_batchsize(struct zone *zone) @@ -7071,88 +7151,6 @@ void __init free_area_init_memoryless_node(int nid) free_area_init_node(nid); } -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) -/* - * Initialize all valid struct pages in the range [spfn, epfn) and mark them - * PageReserved(). Return the number of struct pages that were initialized. - */ -static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) -{ - unsigned long pfn; - u64 pgcnt = 0; - - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; - continue; - } - /* - * Use a fake node/zone (0) for now. Some of these pages - * (in memblock.reserved but not in memblock.memory) will - * get re-initialized via reserve_bootmem_region() later. - */ - __init_single_page(pfn_to_page(pfn), pfn, 0, 0); - __SetPageReserved(pfn_to_page(pfn)); - pgcnt++; - } - - return pgcnt; -} - -/* - * Only struct pages that are backed by physical memory are zeroed and - * initialized by going through __init_single_page(). But, there are some - * struct pages which are reserved in memblock allocator and their fields - * may be accessed (for example page_to_pfn() on some configuration accesses - * flags). We must explicitly initialize those struct pages. - * - * This function also addresses a similar issue where struct pages are left - * uninitialized because the physical address range is not covered by - * memblock.memory or memblock.reserved. That could happen when memblock - * layout is manually configured via memmap=, or when the highest physical - * address (max_pfn) does not end on a section boundary. 
- */ -static void __init init_unavailable_mem(void) -{ - phys_addr_t start, end; - u64 i, pgcnt; - phys_addr_t next = 0; - - /* - * Loop through unavailable ranges not covered by memblock.memory. - */ - pgcnt = 0; - for_each_mem_range(i, &start, &end) { - if (next < start) - pgcnt += init_unavailable_range(PFN_DOWN(next), - PFN_UP(start)); - next = end; - } - - /* - * Early sections always have a fully populated memmap for the whole - * section - see pfn_valid(). If the last section has holes at the - * end and that section is marked "online", the memmap will be - * considered initialized. Make sure that memmap has a well defined - * state. - */ - pgcnt += init_unavailable_range(PFN_DOWN(next), - round_up(max_pfn, PAGES_PER_SECTION)); - - /* - * Struct pages that do not have backing memory. This could be because - * firmware is using some of this memory, or for some other reasons. - */ - if (pgcnt) - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); -} -#else -static inline void __init init_unavailable_mem(void) -{ -} -#endif /* !CONFIG_FLAT_NODE_MEM_MAP */ - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7576,7 +7574,6 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid); diff --git a/mm/page_poison.c b/mm/page_poison.c index 65cdf844c8ad..655dc5895604 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -77,12 +77,14 @@ static void unpoison_page(struct page *page) void *addr; addr = kmap_atomic(page); + kasan_disable_current(); /* * Page poisoning when enabled poisons each and every page * that is freed to buddy. Thus no extra check is done to * see if a page was poisoned. 
*/ - check_poison_mem(addr, PAGE_SIZE); + check_poison_mem(kasan_reset_tag(addr), PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 18b768ac7dca..095d7eaa0db4 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -87,7 +87,7 @@ extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; -extern int pcpu_nr_empty_pop_pages; +extern int pcpu_nr_empty_pop_pages[]; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index c8400a2adbc2..f6026dbcdf6b 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -145,6 +145,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) int slot, max_nr_alloc; int *buffer; enum pcpu_chunk_type type; + int nr_empty_pop_pages; alloc_buffer: spin_lock_irq(&pcpu_lock); @@ -165,7 +166,11 @@ alloc_buffer: goto alloc_buffer; } -#define PL(X) \ + nr_empty_pop_pages = 0; + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) + nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type]; + +#define PL(X) \ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, @@ -196,7 +201,7 @@ alloc_buffer: PU(nr_max_chunks); PU(min_alloc_size); PU(max_alloc_size); - P("empty_pop_pages", pcpu_nr_empty_pop_pages); + P("empty_pop_pages", nr_empty_pop_pages); seq_putc(m, '\n'); #undef PU diff --git a/mm/percpu.c b/mm/percpu.c index 6596a0a4286e..23308113a5ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -173,10 +173,10 @@ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ static LIST_HEAD(pcpu_map_extend_chunks); /* - * The number of empty populated pages, protected by pcpu_lock. The - * reserved chunk doesn't contribute to the count. + * The number of empty populated pages by chunk type, protected by pcpu_lock. + * The reserved chunk doesn't contribute to the count. 
*/ -int pcpu_nr_empty_pop_pages; +int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES]; /* * The number of populated pages in use by the allocator, protected by @@ -556,7 +556,7 @@ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr; } /* @@ -1832,7 +1832,7 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ @@ -2000,7 +2000,7 @@ retry_pop: pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - - pcpu_nr_empty_pop_pages, + pcpu_nr_empty_pop_pages[type], 0, PCPU_EMPTY_POP_PAGES_HIGH); } @@ -2580,7 +2580,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = chunk; - pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; + pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ diff --git a/mm/ptdump.c b/mm/ptdump.c index 4354c1422d57..da751448d0e4 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; - pte_t val = READ_ONCE(*pte); + pte_t val = ptep_get(pte); if (st->effective_prot) st->effective_prot(st, 4, pte_val(val)); diff --git a/mm/shuffle.c b/mm/shuffle.c index 9c2e145a747a..c13c33b247e8 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -147,8 +147,8 @@ void __meminit __shuffle_zone(struct zone *z) spin_unlock_irqrestore(&z->lock, flags); } -/** - * shuffle_free_memory - reduce the predictability of 
the page allocator +/* + * __shuffle_free_memory - reduce the predictability of the page allocator * @pgdat: node page data */ void __meminit __shuffle_free_memory(pg_data_t *pgdat) diff --git a/mm/slab.c b/mm/slab.c index 51fd424e0d6d..ae651bf540b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); - if (!objp) + if (!objp || is_kfence_address(objp)) return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); diff --git a/mm/slub.c b/mm/slub.c index e26c274b4657..3021ce9bf1b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1993,7 +1993,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) - continue; /* cmpxchg raced */ + break; available += objects; if (!object) { diff --git a/mm/z3fold.c b/mm/z3fold.c index b5dafa7e44e4..9d889ad2bb86 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1346,8 +1346,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) page = list_entry(pos, struct page, lru); zhdr = page_address(page); - if (test_bit(PAGE_HEADLESS, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private)) { + /* + * For non-headless pages, we wait to do this + * until we have the page lock to avoid racing + * with __z3fold_alloc(). Headless pages don't + * have a lock (and __z3fold_alloc() will never + * see them), but we still need to test and set + * PAGE_CLAIMED to avoid racing with + * z3fold_free(), so just do it now before + * leaving the loop. + */ + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + continue; + break; + } if (kref_get_unless_zero(&zhdr->refcount) == 0) { zhdr = NULL; |