-rw-r--r--  include/linux/mm.h   |  5
-rw-r--r--  include/linux/rmap.h | 49
-rw-r--r--  mm/huge_memory.c     | 27
-rw-r--r--  mm/hugetlb.c         | 16
-rw-r--r--  mm/memory.c          | 17
-rw-r--r--  mm/migrate.c         |  2
6 files changed, 79 insertions(+), 37 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a02812178562..9ee3ae51e8e3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1575,16 +1575,13 @@ static inline bool page_maybe_dma_pinned(struct page *page)
 
 /*
  * This should most likely only be called during fork() to see whether we
- * should break the cow immediately for a page on the src mm.
+ * should break the cow immediately for an anon page on the src mm.
  *
  * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
  */
 static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
 					  struct page *page)
 {
-	if (!is_cow_mapping(vma->vm_flags))
-		return false;
-
 	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
 
 	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 8573aae50d96..73f41544084f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -12,6 +12,7 @@
 #include <linux/memcontrol.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/memremap.h>
 
 /*
  * The anon_vma heads a list of private "related" vmas, to scan if
@@ -182,11 +183,57 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
 void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
 				unsigned long address);
 
-static inline void page_dup_rmap(struct page *page, bool compound)
+static inline void __page_dup_rmap(struct page *page, bool compound)
 {
 	atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
 }
 
+static inline void page_dup_file_rmap(struct page *page, bool compound)
+{
+	__page_dup_rmap(page, compound);
+}
+
+/**
+ * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
+ *			    anonymous page
+ * @page: the page to duplicate the mapping for
+ * @compound: the page is mapped as compound or as a small page
+ * @vma: the source vma
+ *
+ * The caller needs to hold the PT lock and the vma->vm_mm->write_protect_seq.
+ *
+ * Duplicating the mapping can only fail if the page may be pinned; device
+ * private pages cannot get pinned and consequently this function cannot fail.
+ *
+ * If duplicating the mapping succeeds, the page has to be mapped R/O into
+ * the parent and the child. It must *not* get mapped writable after this call.
+ *
+ * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
+ */
+static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
+					 struct vm_area_struct *vma)
+{
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+	/*
+	 * If this page may have been pinned by the parent process,
+	 * don't allow to duplicate the mapping but instead require to e.g.,
+	 * copy the page immediately for the child so that we'll always
+	 * guarantee the pinned page won't be randomly replaced in the
+	 * future on write faults.
+	 */
+	if (likely(!is_device_private_page(page)) &&
+	    unlikely(page_needs_cow_for_dma(vma, page)))
+		return -EBUSY;
+
+	/*
+	 * It's okay to share the anon page between both processes, mapping
+	 * the page R/O into both processes.
+	 */
+	__page_dup_rmap(page, compound);
+	return 0;
+}
+
 /*
  * Called from mm/vmscan.c to handle paging out
  */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c468fee595ff..baf4ea6d8e1a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1097,23 +1097,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	src_page = pmd_page(pmd);
 	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
 
-	/*
-	 * If this page is a potentially pinned page, split and retry the fault
-	 * with smaller page size.  Normally this should not happen because the
-	 * userspace should use MADV_DONTFORK upon pinned regions.  This is a
-	 * best effort that the pinned pages won't be replaced by another
-	 * random page during the coming copy-on-write.
-	 */
-	if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
+	get_page(src_page);
+	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
+		/* Page maybe pinned: split and retry the fault on PTEs. */
+		put_page(src_page);
 		pte_free(dst_mm, pgtable);
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
 		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
 		return -EAGAIN;
 	}
-
-	get_page(src_page);
-	page_dup_rmap(src_page, true);
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 out_zero_page:
 	mm_inc_nr_ptes(dst_mm);
@@ -1217,14 +1210,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		/* No huge zero pud yet */
 	}
 
-	/* Please refer to comments in copy_huge_pmd() */
-	if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
-		spin_unlock(src_ptl);
-		spin_unlock(dst_ptl);
-		__split_huge_pud(vma, src_pud, addr);
-		return -EAGAIN;
-	}
-
+	/*
+	 * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
+	 * and split if duplicating fails.
+	 */
 	pudp_set_wrprotect(src_mm, addr, src_pud);
 	pud = pud_mkold(pud_wrprotect(pud));
 	set_pud_at(dst_mm, addr, dst_pud, pud);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7389cd8a9a87..7a6052a984e9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4806,15 +4806,18 @@ again:
 			get_page(ptepage);
 
 			/*
-			 * This is a rare case where we see pinned hugetlb
-			 * pages while they're prone to COW.  We need to do the
-			 * COW earlier during fork.
+			 * Failing to duplicate the anon rmap is a rare case
+			 * where we see pinned hugetlb pages while they're
+			 * prone to COW.  We need to do the COW earlier during
+			 * fork.
 			 *
 			 * When pre-allocating the page or copying data, we
 			 * need to be without the pgtable locks since we could
 			 * sleep during the process.
 			 */
-			if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+			if (!PageAnon(ptepage)) {
+				page_dup_file_rmap(ptepage, true);
+			} else if (page_try_dup_anon_rmap(ptepage, true, vma)) {
 				pte_t src_pte_old = entry;
 				struct page *new;
 
@@ -4861,7 +4864,6 @@ again:
 				entry = huge_pte_wrprotect(entry);
 			}
 
-			page_dup_rmap(ptepage, true);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 			hugetlb_count_add(npages, dst);
 		}
@@ -5541,7 +5543,7 @@ retry:
 		ClearHPageRestoreReserve(page);
 		hugepage_add_new_anon_rmap(page, vma, haddr);
 	} else
-		page_dup_rmap(page, true);
+		page_dup_file_rmap(page, true);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, haddr, ptep, new_pte);
@@ -5902,7 +5904,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		goto out_release_unlock;
 
 	if (vm_shared) {
-		page_dup_rmap(page, true);
+		page_dup_file_rmap(page, true);
 	} else {
 		ClearHPageRestoreReserve(page);
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
diff --git a/mm/memory.c b/mm/memory.c
index 9bdfce15e98f..9f5f829a1b1f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -825,7 +825,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 */
 		get_page(page);
 		rss[mm_counter(page)]++;
-		page_dup_rmap(page, false);
+		/* Cannot fail as these pages cannot get pinned. */
+		BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
 
 		/*
 		 * We do not preserve soft-dirty information, because so
@@ -921,18 +922,24 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	struct page *page;
 
 	page = vm_normal_page(src_vma, addr, pte);
-	if (page && unlikely(page_needs_cow_for_dma(src_vma, page))) {
+	if (page && PageAnon(page)) {
 		/*
 		 * If this page may have been pinned by the parent process,
 		 * copy the page immediately for the child so that we'll always
 		 * guarantee the pinned page won't be randomly replaced in the
 		 * future.
 		 */
-		return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
-					 addr, rss, prealloc, page);
+		get_page(page);
+		if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
+			/* Page maybe pinned, we have to copy. */
+			put_page(page);
+			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+						 addr, rss, prealloc, page);
+		}
+		rss[mm_counter(page)]++;
 	} else if (page) {
 		get_page(page);
-		page_dup_rmap(page, false);
+		page_dup_file_rmap(page, false);
 		rss[mm_counter(page)]++;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 77592288fbc0..6ca6e89ded94 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -234,7 +234,7 @@ static bool remove_migration_pte(struct folio *folio,
 			if (folio_test_anon(folio))
 				hugepage_add_anon_rmap(new, vma, pvmw.address);
 			else
-				page_dup_rmap(new, true);
+				page_dup_file_rmap(new, true);
 			set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 		} else
 #endif
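
As a reading aid, here is a minimal userspace C model of the fork-time decision this patch centralizes: file-backed pages are always shared by bumping their mapcount (page_dup_file_rmap()), while an anonymous page that may be pinned must be copied for the child instead of being COW-shared (page_try_dup_anon_rmap() returning -EBUSY). This is not kernel code; struct fake_page and all function names below are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	bool anon;		/* stands in for PageAnon() */
	bool maybe_pinned;	/* stands in for page_needs_cow_for_dma() */
	int mapcount;
};

/* Models page_dup_file_rmap(): sharing a file-backed page never fails. */
static void dup_file_rmap(struct fake_page *page)
{
	page->mapcount++;
}

/* Models page_try_dup_anon_rmap(): refuse to share a maybe-pinned page. */
static int try_dup_anon_rmap(struct fake_page *page)
{
	if (page->maybe_pinned)
		return -1;	/* -EBUSY in the kernel: caller copies now */
	page->mapcount++;	/* shared R/O between parent and child */
	return 0;
}

static void fork_copy_one(struct fake_page *page)
{
	if (!page->anon) {
		dup_file_rmap(page);
		printf("file page: shared, mapcount=%d\n", page->mapcount);
	} else if (try_dup_anon_rmap(page) == 0) {
		printf("anon page: shared R/O, mapcount=%d\n", page->mapcount);
	} else {
		printf("anon page maybe pinned: copy immediately for child\n");
	}
}

int main(void)
{
	struct fake_page file = { .anon = false, .mapcount = 1 };
	struct fake_page anon = { .anon = true, .mapcount = 1 };
	struct fake_page pinned = { .anon = true, .maybe_pinned = true,
				    .mapcount = 1 };

	fork_copy_one(&file);
	fork_copy_one(&anon);
	fork_copy_one(&pinned);
	return 0;
}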
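
The comment removed from copy_huge_pmd() notes that userspace should normally apply MADV_DONTFORK to pinned regions so fork() never COW-shares them in the first place. A short sketch of that mitigation follows; madvise(MADV_DONTFORK) is the real API, while the buffer size and the elided DMA-registration step are illustrative only.

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* e.g. one PMD-sized buffer */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Exclude the region from child address spaces before pinning it. */
	if (madvise(buf, len, MADV_DONTFORK))
		return 1;

	/* ... register buf for DMA/RDMA here, then fork() safely ... */
	if (fork() == 0)
		_exit(0);	/* the child never maps buf at all */

	wait(NULL);
	munmap(buf, len);
	return 0;
}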