diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-14 06:29:45 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-14 06:29:45 +0300 |
commit | e2ca6ba6ba0152361aa4fcbf6067db71b2c7a770 (patch) | |
tree | f7ed7753a2e66486a4ffe0fbbf98404ec4ba2212 /mm/huge_memory.c | |
parent | 7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (diff) | |
parent | c45bc55a99957b20e4e0333bcd42e12d1833a7f5 (diff) | |
download | linux-e2ca6ba6ba0152361aa4fcbf6067db71b2c7a770.tar.xz |
Merge tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- More userfaultfs work from Peter Xu
- Several convert-to-folios series from Sidhartha Kumar and Huang Ying
- Some filemap cleanups from Vishal Moola
- David Hildenbrand added the ability to selftest anon memory COW
handling
- Some cpuset simplifications from Liu Shixin
- Addition of vmalloc tracing support by Uladzislau Rezki
- Some pagecache folioifications and simplifications from Matthew
Wilcox
- A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use
it
- Miguel Ojeda contributed some cleanups for our use of the
__no_sanitize_thread__ gcc keyword.
This series should have been in the non-MM tree, my bad
- Naoya Horiguchi improved the interaction between memory poisoning and
memory section removal for huge pages
- DAMON cleanups and tuneups from SeongJae Park
- Tony Luck fixed the handling of COW faults against poisoned pages
- Peter Xu utilized the PTE marker code for handling swapin errors
- Hugh Dickins reworked compound page mapcount handling, simplifying it
and making it more efficient
- Removal of the autonuma savedwrite infrastructure from Nadav Amit and
David Hildenbrand
- zram support for multiple compression streams from Sergey Senozhatsky
- David Hildenbrand reworked the GUP code's R/O long-term pinning so
that drivers no longer need to use the FOLL_FORCE workaround which
didn't work very well anyway
- Mel Gorman altered the page allocator so that local IRQs can remnain
enabled during per-cpu page allocations
- Vishal Moola removed the try_to_release_page() wrapper
- Stefan Roesch added some per-BDI sysfs tunables which are used to
prevent network block devices from dirtying excessive amounts of
pagecache
- David Hildenbrand did some cleanup and repair work on KSM COW
breaking
- Nhat Pham and Johannes Weiner have implemented writeback in zswap's
zsmalloc backend
- Brian Foster has fixed a longstanding corner-case oddity in
file[map]_write_and_wait_range()
- sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang
Chen
- Shiyang Ruan has done some work on fsdax, to make its reflink mode
work better under xfstests. Better, but still not perfect
- Christoph Hellwig has removed the .writepage() method from several
filesystems. They only need .writepages()
- Yosry Ahmed wrote a series which fixes the memcg reclaim target
beancounting
- David Hildenbrand has fixed some of our MM selftests for 32-bit
machines
- Many singleton patches, as usual
* tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (313 commits)
mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio
mm: mmu_gather: allow more than one batch of delayed rmaps
mm: fix typo in struct pglist_data code comment
kmsan: fix memcpy tests
mm: add cond_resched() in swapin_walk_pmd_entry()
mm: do not show fs mm pc for VM_LOCKONFAULT pages
selftests/vm: ksm_functional_tests: fixes for 32bit
selftests/vm: cow: fix compile warning on 32bit
selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions
mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem
mm,thp,rmap: fix races between updates of subpages_mapcount
mm: memcg: fix swapcached stat accounting
mm: add nodes= arg to memory.reclaim
mm: disable top-tier fallback to reclaim on proactive reclaim
selftests: cgroup: make sure reclaim target memcg is unprotected
selftests: cgroup: refactor proactive reclaim code to reclaim_until()
mm: memcg: fix stale protection of reclaim target memcg
mm/mmap: properly unaccount memory on mas_preallocate() failure
omfs: remove ->writepage
jfs: remove ->writepage
...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 157 |
1 files changed, 84 insertions, 73 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ffbea56a8711..2546199ab3c0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1318,9 +1318,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); - if (is_huge_zero_pmd(orig_pmd)) goto fallback; @@ -1384,7 +1381,7 @@ reuse: if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - return VM_FAULT_WRITE; + return 0; } unlock_fallback: @@ -1395,6 +1392,36 @@ fallback: return VM_FAULT_FALLBACK; } +static inline bool can_change_pmd_writable(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page; + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; + + /* Don't touch entries that are not even readable (NUMA hinting). */ + if (pmd_protnone(pmd)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_huge_pmd_wp(vma, pmd)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* See can_change_pte_writable(). */ + page = vm_normal_page_pmd(vma, addr, pmd); + return page && PageAnon(page) && PageAnonExclusive(page); + } + + /* See can_change_pte_writable(). */ + return pmd_dirty(pmd); +} + /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, @@ -1459,7 +1486,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; - if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && @@ -1488,8 +1515,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); - bool migrated = false; - bool was_writable = pmd_savedwrite(oldpmd); + bool migrated = false, writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1499,12 +1525,22 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } pmd = pmd_modify(oldpmd, vma->vm_page_prot); + + /* + * Detect now whether the PMD could be writable; this information + * is only valid while holding the PT lock. + */ + writable = pmd_write(pmd); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pmd_writable(vma, vmf->address, pmd)) + writable = true; + page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) goto out_map; /* See similar comment in do_numa_page for explanation */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); @@ -1523,6 +1559,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } spin_unlock(vmf->ptl); + writable = false; migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { @@ -1549,7 +1586,7 @@ out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); - if (was_writable) + if (writable) pmd = pmd_mkwrite(pmd); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); @@ -1790,11 +1827,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; - bool preserve_write; - int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); @@ -1805,9 +1841,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; - #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -1887,8 +1920,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); if (uffd_wp) { entry = pmd_wrprotect(entry); entry = pmd_mkuffd_wp(entry); @@ -1900,13 +1931,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ entry = pmd_clear_uffd_wp(entry); } + + /* See change_pte_range(). */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && + can_change_pmd_writable(vma, addr, entry)) + entry = pmd_mkwrite(entry); + ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); if (huge_pmd_needs_flush(oldpmd, entry)) tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); - - BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: spin_unlock(ptl); return ret; @@ -2148,7 +2183,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, uffd_wp = pmd_uffd_wp(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); - page_ref_add(page, HPAGE_PMD_NR - 1); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2168,6 +2202,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) freeze = false; + if (!freeze) + page_ref_add(page, HPAGE_PMD_NR - 1); } /* @@ -2209,66 +2245,37 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = maybe_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); - if (!write) - entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); /* - * NOTE: we don't do pte_mkdirty when dirty==true - * because it breaks sparc64 which can sigsegv - * random process. Need to revisit when we figure - * out what is special with sparc64. + * NOTE: this needs to happen after pte_mkdirty, + * because some archs (sparc64, loongarch) could + * set hw write bit when mkdirty. */ + if (!write) + entry = pte_wrprotect(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); + page_add_anon_rmap(page + i, vma, addr, false); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - if (!pmd_migration) - atomic_inc(&page[i]._mapcount); pte_unmap(pte); } - if (!pmd_migration) { - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). - */ - if (compound_mapcount(page) > 1 && - !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_inc(&page[i]._mapcount); - } - - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. */ - __mod_lruvec_page_state(page, NR_ANON_THPS, - -HPAGE_PMD_NR); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); - } - } - unlock_page_memcg(page); - - /* Above is effectively page_remove_rmap(page, vma, true) */ - munlock_vma_page(page, vma, true); - } + if (!pmd_migration) + page_remove_rmap(page, vma, true); + if (freeze) + put_page(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - - if (freeze) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, vma, false); - put_page(page + i); - } - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, @@ -2460,7 +2467,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); - /* ->mapping in first tail page is compound_mapcount */ + /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; @@ -2470,6 +2477,10 @@ static void __split_huge_page_tail(struct page *head, int tail, * page->private should not be set in tail pages with the exception * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. + * + * What of 32-bit systems, on which head[1].compound_pincount overlays + * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and + * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); @@ -2722,7 +2733,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { - ret = -EBUSY; + ret = -EAGAIN; goto out_unlock; } @@ -2772,7 +2783,7 @@ fail: xas_unlock(&xas); local_irq_enable(); remap_page(folio, folio_nr_pages(folio)); - ret = -EBUSY; + ret = -EAGAIN; } out_unlock: @@ -3076,28 +3087,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, mapping = candidate->f_mapping; for (index = off_start; index < off_end; index += nr_pages) { - struct page *fpage = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, + FGP_ENTRY, 0); nr_pages = 1; - if (xa_is_value(fpage) || !fpage) + if (xa_is_value(folio) || !folio) continue; - if (!is_transparent_hugepage(fpage)) + if (!folio_test_large(folio)) goto next; total++; - nr_pages = thp_nr_pages(fpage); + nr_pages = folio_nr_pages(folio); - if (!trylock_page(fpage)) + if (!folio_trylock(folio)) goto next; - if (!split_huge_page(fpage)) + if (!split_folio(folio)) split++; - unlock_page(fpage); + folio_unlock(folio); next: - put_page(fpage); + folio_put(folio); cond_resched(); } |