summaryrefslogtreecommitdiff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c365
1 files changed, 204 insertions, 161 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 01f39e8144ef..405a483d2fd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,7 +77,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -361,12 +360,10 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked)
{
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -375,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
* be 0. This will underflow and is okay.
*/
- next = mas_find(&mas, ceiling - 1);
+ next = mas_find(mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
@@ -396,7 +393,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = mas_find(&mas, ceiling - 1);
+ next = mas_find(mas, ceiling - 1);
if (mm_wr_locked)
vma_start_write(vma);
unlink_anon_vmas(vma);
@@ -860,8 +857,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
- if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma))
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ pte_marker marker = copy_pte_marker(entry, dst_vma);
+
+ if (marker)
+ set_pte_at(dst_mm, addr, dst_pte,
+ make_pte_marker(marker));
return 0;
}
if (!userfaultfd_wp(dst_vma))
@@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
* Use the raw variant of the seqcount_t write API to avoid
* lockdep complaining about preemptibility.
*/
- mmap_assert_write_locked(src_mm);
+ vma_assert_write_locked(src_vma);
raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
@@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
tlb_remove_tlb_entry(tlb, pte, addr);
zap_install_uffd_wp_if_needed(vma, addr, pte, details,
ptent);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ ksm_might_unmap_zero_page(mm, ptent);
continue;
+ }
delay_rmap = 0;
if (!PageAnon(page)) {
@@ -1500,7 +1502,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
!zap_drop_file_uffd_wp(details))
continue;
} else if (is_hwpoison_entry(entry) ||
- is_swapin_error_entry(entry)) {
+ is_poisoned_swp_entry(entry)) {
if (!should_zap_cows(details))
continue;
} else {
@@ -1691,10 +1693,12 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
- * @mt: the maple tree
+ * @mas: the maple state
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
+ * @tree_end: The maximum index to check
+ * @mm_wr_locked: lock flag
*
* Unmap all pages in the vma list.
*
@@ -1707,9 +1711,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, bool mm_wr_locked)
+ unsigned long end_addr, unsigned long tree_end,
+ bool mm_wr_locked)
{
struct mmu_notifier_range range;
struct zap_details details = {
@@ -1717,7 +1722,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
start_addr, end_addr);
@@ -1725,7 +1729,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
do {
unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
mm_wr_locked);
- } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
+ } while ((vma = mas_find(mas, tree_end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1865,7 +1869,6 @@ out:
return retval;
}
-#ifdef pte_index
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
@@ -1880,7 +1883,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
}
/* insert_pages() amortizes the cost of spinlock operations
- * when inserting pages in a loop. Arch *must* define pte_index.
+ * when inserting pages in a loop.
*/
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
@@ -1939,7 +1942,6 @@ out:
*num = remaining_pages_total;
return ret;
}
-#endif /* ifdef pte_index */
/**
* vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
@@ -1959,7 +1961,6 @@ out:
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num)
{
-#ifdef pte_index
const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
if (addr < vma->vm_start || end_addr >= vma->vm_end)
@@ -1971,18 +1972,6 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
}
/* Defer page refcount checking till we're about to map that page. */
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
-#else
- unsigned long idx = 0, pgcount = *num;
- int err = -EINVAL;
-
- for (; idx < pgcount; ++idx) {
- err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
- if (err)
- break;
- }
- *num = pgcount - idx;
- return err;
-#endif /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
@@ -2858,7 +2847,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
entry = pte_mkyoung(vmf->orig_pte);
if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
- update_mmu_cache(vma, addr, vmf->pte);
+ update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
}
/*
@@ -2927,10 +2916,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
vm_fault_t ret;
- struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
@@ -2945,14 +2933,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio->mapping) {
+ folio_unlock(folio);
return 0; /* retry */
}
ret |= VM_FAULT_LOCKED;
} else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
return ret;
}
@@ -2965,20 +2953,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
+ dirtied = folio_mark_dirty(folio);
+ VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
/*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * Take a local copy of the address_space - folio.mapping may be zeroed
+ * by truncate after folio_unlock(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on folio_unlock()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = page_rmapping(page);
- unlock_page(page);
+ mapping = folio_raw_mapping(folio);
+ folio_unlock(folio);
if (!page_mkwrite)
file_update_time(vma->vm_file);
@@ -3036,7 +3024,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
pte_unmap_unlock(vmf->pte, vmf->ptl);
count_vm_event(PGREUSE);
}
@@ -3128,6 +3116,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
+ ksm_might_unmap_zero_page(mm, vmf->orig_pte);
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3149,7 +3138,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* that left a window where the new PTE could be loaded into
* some TLBs while the old PTE remains in others.
*/
- ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ ptep_clear_flush(vma, vmf->address, vmf->pte);
folio_add_new_anon_rmap(new_folio, vma, vmf->address);
folio_add_lru_vma(new_folio, vma);
/*
@@ -3159,7 +3148,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
if (old_folio) {
/*
* Only after switching the pte to the new page may
@@ -3195,11 +3184,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
+ mmu_notifier_invalidate_range_end(&range);
if (new_folio)
folio_put(new_folio);
@@ -3269,6 +3254,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
@@ -3279,36 +3269,42 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return 0;
}
-static vm_fault_t wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
- get_page(vmf->page);
+ folio_get(folio);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- tmp = do_page_mkwrite(vmf);
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ folio_put(folio);
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
+ tmp = do_page_mkwrite(vmf, folio);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
tmp = finish_mkwrite_fault(vmf);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return tmp;
}
} else {
wp_page_reuse(vmf);
- lock_page(vmf->page);
+ folio_lock(folio);
}
ret |= fault_dirty_shared_page(vmf);
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}
@@ -3359,6 +3355,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (vmf->page)
+ folio = page_folio(vmf->page);
+
/*
* Shared mapping: we are guaranteed to have VM_WRITE and
* FAULT_FLAG_WRITE set at this point.
@@ -3373,12 +3372,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
*/
if (!vmf->page)
return wp_pfn_shared(vmf);
- return wp_page_shared(vmf);
+ return wp_page_shared(vmf, folio);
}
- if (vmf->page)
- folio = page_folio(vmf->page);
-
/*
* Private mapping: create an exclusive anonymous page copy if reuse
* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
@@ -3432,6 +3428,12 @@ reuse:
return 0;
}
copy:
+ if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
/*
* Ok, we need to copy. Oh, well..
*/
@@ -3495,7 +3497,7 @@ void unmap_mapping_folio(struct folio *folio)
VM_BUG_ON(!folio_test_locked(folio));
first_index = folio->index;
- last_index = folio->index + folio_nr_pages(folio) - 1;
+ last_index = folio_next_index(folio) - 1;
details.even_cows = false;
details.single_folio = folio;
@@ -3582,6 +3584,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
struct folio *folio = page_folio(vmf->page);
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
+ vm_fault_t ret;
/*
* We need a reference to lock the folio because we don't hold
@@ -3594,9 +3597,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
if (!folio_try_get(folio))
return 0;
- if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
+ ret = folio_lock_or_retry(folio, vmf);
+ if (ret) {
folio_put(folio);
- return VM_FAULT_RETRY;
+ return ret;
}
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3647,7 +3651,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* none pte. Otherwise it means the pte could have changed, so retry.
*
* This should also cover the case where e.g. the pte changed
- * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
+ * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
@@ -3693,8 +3697,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/* Higher priority than uffd-wp when data corrupted */
- if (marker & PTE_MARKER_SWAPIN_ERROR)
- return VM_FAULT_SIGBUS;
+ if (marker & PTE_MARKER_POISONED)
+ return VM_FAULT_HWPOISON;
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -3721,18 +3725,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
- int locked;
vm_fault_t ret = 0;
void *shadow = NULL;
if (!pte_unmap_same(vmf))
goto out;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- ret = VM_FAULT_RETRY;
- goto out;
- }
-
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -3742,6 +3740,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ /*
+ * migrate_to_ram is not yet ready to operate
+ * under VMA lock.
+ */
+ vma_end_read(vma);
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
@@ -3805,7 +3813,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_add_lru(folio);
/* To provide entry to swap_readpage() */
- folio_set_swap_entry(folio, entry);
+ folio->swap = entry;
swap_readpage(page, true, NULL);
folio->private = NULL;
}
@@ -3843,12 +3851,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
-
- if (!locked) {
- ret |= VM_FAULT_RETRY;
+ ret |= folio_lock_or_retry(folio, vmf);
+ if (ret & VM_FAULT_RETRY)
goto out_release;
- }
if (swapcache) {
/*
@@ -3859,7 +3864,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* changed.
*/
if (unlikely(!folio_test_swapcache(folio) ||
- page_private(page) != entry.val))
+ page_swap_entry(page).val != entry.val))
goto out_page;
/*
@@ -4026,7 +4031,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4150,7 +4155,7 @@ setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4245,7 +4250,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
- int i;
vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
@@ -4278,8 +4282,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
- for (i = 0; i < HPAGE_PMD_NR; i++)
- flush_icache_page(vma, page + i);
+ flush_icache_pages(vma, page, HPAGE_PMD_NR);
entry = mk_huge_pmd(page, vma->vm_page_prot);
if (write)
@@ -4312,15 +4315,24 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
}
#endif
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+/**
+ * set_pte_range - Set a range of PTEs to point to pages in a folio.
+ * @vmf: Fault decription.
+ * @folio: The folio that contains @page.
+ * @page: The first page to create a PTE for.
+ * @nr: The number of PTEs to create.
+ * @addr: The first address to create a PTE for.
+ */
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+ struct page *page, unsigned int nr, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool prefault = vmf->address != addr;
+ bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
pte_t entry;
- flush_icache_page(vma, page);
+ flush_icache_pages(vma, page, nr);
entry = mk_pte(page, vma->vm_page_prot);
if (prefault && arch_wants_old_prefaulted_pte())
@@ -4334,14 +4346,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+ VM_BUG_ON_FOLIO(nr != 1, folio);
+ folio_add_new_anon_rmap(folio, vma, addr);
+ folio_add_lru_vma(folio, vma);
} else {
- inc_mm_counter(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, vma, false);
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+ folio_add_file_rmap_range(folio, page, nr, vma, false);
}
- set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+ set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}
static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4409,11 +4425,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
- do_set_pte(vmf, page, vmf->address);
-
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ struct folio *folio = page_folio(page);
+ set_pte_range(vmf, folio, page, 1, vmf->address);
ret = 0;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -4532,6 +4546,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
vm_fault_t ret = 0;
+ struct folio *folio;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
@@ -4544,14 +4559,20 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
return ret;
}
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vmf->vma);
+ return VM_FAULT_RETRY;
+ }
+
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ folio = page_folio(vmf->page);
+ folio_unlock(folio);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}
@@ -4560,6 +4581,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -4598,21 +4624,29 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
+ struct folio *folio;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ folio = page_folio(vmf->page);
+
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(vmf->page);
- tmp = do_page_mkwrite(vmf);
+ folio_unlock(folio);
+ tmp = do_page_mkwrite(vmf, folio);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
}
@@ -4620,8 +4654,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}
@@ -4810,43 +4844,45 @@ out_map:
if (writable)
pte = pte_mkwrite(pte);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
- if (vma_is_anonymous(vmf->vma))
+ struct vm_area_struct *vma = vmf->vma;
+ if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (vma->vm_ops->huge_fault)
+ return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
}
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
vm_fault_t ret;
- if (vma_is_anonymous(vmf->vma)) {
+ if (vma_is_anonymous(vma)) {
if (likely(!unshare) &&
- userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+ userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
- if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- if (vmf->vma->vm_ops->huge_fault) {
- ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vma->vm_ops->huge_fault) {
+ ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
}
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -4855,11 +4891,12 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ struct vm_area_struct *vma = vmf->vma;
/* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (vma->vm_ops->huge_fault)
+ return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
@@ -4868,21 +4905,22 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
+ if (vma_is_anonymous(vma))
goto split;
- if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- if (vmf->vma->vm_ops->huge_fault) {
- ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+ if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+ if (vma->vm_ops->huge_fault) {
+ ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
}
split:
/* COW or write-notify not handled on PUD level: split pud.*/
- __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+ __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
@@ -4959,7 +4997,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+ update_mmu_cache_range(vmf, vmf->vma, vmf->address,
+ vmf->pte, 1);
} else {
/* Skip spurious TLB flush for retried page fault */
if (vmf->flags & FAULT_FLAG_TRIED)
@@ -4980,10 +5019,10 @@ unlock:
}
/*
- * By the time we get here, we already hold the mm semaphore
- *
- * The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __folio_lock_or_retry().
+ * On entry, we hold either the VMA lock or the mmap_lock
+ * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
+ * the result, the mmap_lock is not held on exit. See filemap_fault()
+ * and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@@ -5081,7 +5120,7 @@ retry_pud:
/**
* mm_account_fault - Do page fault accounting
- *
+ * @mm: mm from which memcg should be extracted. It can be NULL.
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
* the task who triggered this page fault.
@@ -5189,6 +5228,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
!is_cow_mapping(vma->vm_flags)))
return VM_FAULT_SIGSEGV;
}
+#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
+ * the assumption that lock is dropped on VM_FAULT_RETRY.
+ */
+ if (WARN_ON_ONCE((*flags &
+ (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
+ (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
+ return VM_FAULT_SIGSEGV;
+#endif
+
return 0;
}
@@ -5257,11 +5307,8 @@ EXPORT_SYMBOL_GPL(handle_mm_fault);
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
- /* Even if this succeeds, make it clear we *might* have slept */
- if (likely(mmap_read_trylock(mm))) {
- might_sleep();
+ if (likely(mmap_read_trylock(mm)))
return true;
- }
if (regs && !user_mode(regs)) {
unsigned long ip = instruction_pointer(regs);
@@ -5389,31 +5436,21 @@ retry:
if (!vma)
goto inval;
- /* Only anonymous and tcp vmas are supported for now */
- if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
- goto inval;
-
- /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
- if (!vma->anon_vma && !vma_is_tcp(vma))
- goto inval;
-
if (!vma_start_read(vma))
goto inval;
/*
- * Due to the possibility of userfault handler dropping mmap_lock, avoid
- * it for now and fall back to page fault handling under mmap_lock.
+ * find_mergeable_anon_vma uses adjacent vmas which are not locked.
+ * This check must happen after vma_start_read(); otherwise, a
+ * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
+ * from its anon_vma.
*/
- if (userfaultfd_armed(vma)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
+ goto inval_end_read;
/* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- vma_end_read(vma);
- goto inval;
- }
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
/* Check if the VMA got isolated after we found it */
if (vma->detached) {
@@ -5425,6 +5462,9 @@ retry:
rcu_read_unlock();
return vma;
+
+inval_end_read:
+ vma_end_read(vma);
inval:
rcu_read_unlock();
count_vm_vma_lock_event(VMA_LOCK_ABORT);
@@ -5701,6 +5741,9 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
if (mmap_read_lock_killable(mm))
return 0;
+ /* Untag the address before looking up the VMA */
+ addr = untagged_addr_remote(mm, addr);
+
/* Avoid triggering the temporary warning in __get_user_pages */
if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
return 0;
@@ -6055,19 +6098,19 @@ void __init ptlock_cache_init(void)
SLAB_PANIC, NULL);
}
-bool ptlock_alloc(struct page *page)
+bool ptlock_alloc(struct ptdesc *ptdesc)
{
spinlock_t *ptl;
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
- page->ptl = ptl;
+ ptdesc->ptl = ptl;
return true;
}
-void ptlock_free(struct page *page)
+void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, page->ptl);
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif