Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	399
1 file changed, 267 insertions, 132 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 0bfc8b007c01..f2bc6dd15eb8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		}
 		rss[MM_SWAPENTS]++;
 	} else if (is_migration_entry(entry)) {
-		page = pfn_swap_entry_to_page(entry);
+		folio = pfn_swap_entry_folio(entry);
 
-		rss[mm_counter(page)]++;
+		rss[mm_counter(folio)]++;
 
 		if (!is_readable_migration_entry(entry) &&
 				is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * keep things as they are.
 		 */
 		folio_get(folio);
-		rss[mm_counter(page)]++;
+		rss[mm_counter(folio)]++;
 		/* Cannot fail as these pages cannot get pinned. */
 		folio_try_dup_anon_rmap_pte(folio, page, src_vma);
 
@@ -930,68 +930,111 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	return 0;
 }
 
+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
+		pte_t pte, unsigned long addr, int nr)
+{
+	struct mm_struct *src_mm = src_vma->vm_mm;
+
+	/* If it's a COW mapping, write protect it both processes. */
+	if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
+		wrprotect_ptes(src_mm, addr, src_pte, nr);
+		pte = pte_wrprotect(pte);
+	}
+
+	/* If it's a shared mapping, mark it clean in the child. */
+	if (src_vma->vm_flags & VM_SHARED)
+		pte = pte_mkclean(pte);
+	pte = pte_mkold(pte);
+
+	if (!userfaultfd_wp(dst_vma))
+		pte = pte_clear_uffd_wp(pte);
+
+	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
 /*
- * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
  */
 static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
-		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
-		 struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
+		 int max_nr, int *rss, struct folio **prealloc)
 {
-	struct mm_struct *src_mm = src_vma->vm_mm;
-	unsigned long vm_flags = src_vma->vm_flags;
-	pte_t pte = ptep_get(src_pte);
 	struct page *page;
 	struct folio *folio;
+	bool any_writable;
+	fpb_t flags = 0;
+	int err, nr;
 
 	page = vm_normal_page(src_vma, addr, pte);
-	if (page)
-		folio = page_folio(page);
-	if (page && folio_test_anon(folio)) {
+	if (unlikely(!page))
+		goto copy_pte;
+
+	folio = page_folio(page);
+
+	/*
+	 * If we likely have to copy, just don't bother with batching. Make
+	 * sure that the common "small folio" case is as fast as possible
+	 * by keeping the batching logic separate.
+	 */
+	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+		if (src_vma->vm_flags & VM_SHARED)
+			flags |= FPB_IGNORE_DIRTY;
+		if (!vma_soft_dirty_enabled(src_vma))
+			flags |= FPB_IGNORE_SOFT_DIRTY;
+
+		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+				     &any_writable);
+		folio_ref_add(folio, nr);
+		if (folio_test_anon(folio)) {
+			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+								  nr, src_vma))) {
+				folio_ref_sub(folio, nr);
+				return -EAGAIN;
+			}
+			rss[MM_ANONPAGES] += nr;
+			VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+		} else {
+			folio_dup_file_rmap_ptes(folio, page, nr);
+			rss[mm_counter_file(folio)] += nr;
+		}
+		if (any_writable)
+			pte = pte_mkwrite(pte, src_vma);
+		__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+				    addr, nr);
+		return nr;
+	}
+
+	folio_get(folio);
+	if (folio_test_anon(folio)) {
 		/*
 		 * If this page may have been pinned by the parent process,
 		 * copy the page immediately for the child so that we'll always
 		 * guarantee the pinned page won't be randomly replaced in the
 		 * future.
		 */
-		folio_get(folio);
 		if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
 			/* Page may be pinned, we have to copy. */
 			folio_put(folio);
-			return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
-						 addr, rss, prealloc, page);
+			err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+						addr, rss, prealloc, page);
+			return err ? err : 1;
 		}
 		rss[MM_ANONPAGES]++;
-	} else if (page) {
-		folio_get(folio);
+		VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+	} else {
 		folio_dup_file_rmap_pte(folio, page);
-		rss[mm_counter_file(page)]++;
-	}
-
-	/*
-	 * If it's a COW mapping, write protect it both
-	 * in the parent and the child
-	 */
-	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
-		ptep_set_wrprotect(src_mm, addr, src_pte);
-		pte = pte_wrprotect(pte);
+		rss[mm_counter_file(folio)]++;
 	}
-	VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
 
-	/*
-	 * If it's a shared mapping, mark it clean in
-	 * the child
-	 */
-	if (vm_flags & VM_SHARED)
-		pte = pte_mkclean(pte);
-	pte = pte_mkold(pte);
-
-	if (!userfaultfd_wp(dst_vma))
-		pte = pte_clear_uffd_wp(pte);
-
-	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
-	return 0;
+copy_pte:
+	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+	return 1;
 }
 
 static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
@@ -1028,10 +1071,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	pte_t *src_pte, *dst_pte;
 	pte_t ptent;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress, ret = 0;
+	int progress, max_nr, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
 	struct folio *prealloc = NULL;
+	int nr;
 
 again:
 	progress = 0;
@@ -1062,6 +1106,8 @@ again:
 	arch_enter_lazy_mmu_mode();
 
 	do {
+		nr = 1;
+
 		/*
 		 * We are holding two locks at this point - either of them
 		 * could generate latencies in another task on another CPU.
@@ -1091,6 +1137,8 @@ again:
 				progress += 8;
 				continue;
 			}
+			ptent = ptep_get(src_pte);
+			VM_WARN_ON_ONCE(!pte_present(ptent));
 
 			/*
 			 * Device exclusive entry restored, continue by copying
@@ -1098,9 +1146,10 @@ again:
 			 */
 			WARN_ON_ONCE(ret != -ENOENT);
 		}
-		/* copy_present_pte() will clear `*prealloc' if consumed */
-		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
-				       addr, rss, &prealloc);
+		/* copy_present_ptes() will clear `*prealloc' if consumed */
+		max_nr = (end - addr) / PAGE_SIZE;
+		ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+					ptent, addr, max_nr, rss, &prealloc);
 		/*
 		 * If we need a pre-allocated page for this pte, drop the
 		 * locks, allocate, and try again.
@@ -1117,8 +1166,10 @@ again:
 			folio_put(prealloc);
 			prealloc = NULL;
 		}
-		progress += 8;
-	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+		nr = ret;
+		progress += 8 * nr;
+	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+		 addr != end);
 
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1139,7 +1190,7 @@ again:
 		prealloc = folio_prealloc(src_mm, src_vma, addr, false);
 		if (!prealloc)
 			return -ENOMEM;
-	} else if (ret) {
+	} else if (ret < 0) {
 		VM_WARN_ON_ONCE(1);
 	}
 
@@ -1369,19 +1420,16 @@ static inline bool should_zap_cows(struct zap_details *details)
 	return details->even_cows;
 }
 
-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+				    struct folio *folio)
 {
-	/* If we can make a decision without *page.. */
+	/* If we can make a decision without *folio.. */
 	if (should_zap_cows(details))
 		return true;
 
-	/* E.g. the caller passes NULL for the case of a zero page */
-	if (!page)
-		return true;
-
-	/* Otherwise we should only zap non-anon pages */
-	return !PageAnon(page);
+	/* Otherwise we should only zap non-anon folios */
+	return !folio_test_anon(folio);
 }
 
 static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1398,7 +1446,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
  */
 static inline void
 zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
-			      unsigned long addr, pte_t *pte,
+			      unsigned long addr, pte_t *pte, int nr,
 			      struct zap_details *details, pte_t pteval)
 {
 	/* Zap on anonymous always means dropping everything */
@@ -1408,7 +1456,111 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	if (zap_drop_file_uffd_wp(details))
 		return;
 
-	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+	for (;;) {
+		/* the PFN in the PTE is irrelevant. */
+		pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+		if (--nr == 0)
+			break;
+		pte++;
+		addr += PAGE_SIZE;
+	}
+}
+
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, struct folio *folio,
+		struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+		unsigned long addr, struct zap_details *details, int *rss,
+		bool *force_flush, bool *force_break)
+{
+	struct mm_struct *mm = tlb->mm;
+	bool delay_rmap = false;
+
+	if (!folio_test_anon(folio)) {
+		ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		if (pte_dirty(ptent)) {
+			folio_mark_dirty(folio);
+			if (tlb_delay_rmap(tlb)) {
+				delay_rmap = true;
+				*force_flush = true;
+			}
+		}
+		if (pte_young(ptent) && likely(vma_has_recency(vma)))
+			folio_mark_accessed(folio);
+		rss[mm_counter(folio)] -= nr;
+	} else {
+		/* We don't need up-to-date accessed/dirty bits. */
+		clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		rss[MM_ANONPAGES] -= nr;
+	}
+	/* Checking a single PTE in a batch is sufficient. */
+	arch_check_zapped_pte(vma, ptent);
+	tlb_remove_tlb_entries(tlb, pte, nr, addr);
+	if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+					      ptent);
+
+	if (!delay_rmap) {
+		folio_remove_rmap_ptes(folio, page, nr, vma);
+
+		/* Only sanity-check the first page in a batch. */
+		if (unlikely(page_mapcount(page) < 0))
+			print_bad_pte(vma, addr, ptent, page);
+	}
+	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
+		*force_flush = true;
+		*force_break = true;
+	}
+}
+
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
+		struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+		unsigned int max_nr, unsigned long addr,
+		struct zap_details *details, int *rss, bool *force_flush,
+		bool *force_break)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	struct mm_struct *mm = tlb->mm;
+	struct folio *folio;
+	struct page *page;
+	int nr;
+
+	page = vm_normal_page(vma, addr, ptent);
+	if (!page) {
+		/* We don't need up-to-date accessed/dirty bits. */
+		ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+		arch_check_zapped_pte(vma, ptent);
+		tlb_remove_tlb_entry(tlb, pte, addr);
+		VM_WARN_ON_ONCE(userfaultfd_wp(vma));
+		ksm_might_unmap_zero_page(mm, ptent);
+		return 1;
+	}
+
+	folio = page_folio(page);
+	if (unlikely(!should_zap_folio(details, folio)))
+		return 1;
+
+	/*
+	 * Make sure that the common "small folio" case is as fast as possible
+	 * by keeping the batching logic separate.
+	 */
+	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+		nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+				     NULL);
+
+		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+				       addr, details, rss, force_flush,
+				       force_break);
+		return nr;
+	}
+	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+			       details, rss, force_flush, force_break);
+	return 1;
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1416,13 +1568,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	bool force_flush = false, force_break = false;
 	struct mm_struct *mm = tlb->mm;
-	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
 	swp_entry_t entry;
+	int nr;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	init_rss_vec(rss);
@@ -1436,7 +1589,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		pte_t ptent = ptep_get(pte);
 		struct folio *folio;
 		struct page *page;
+		int max_nr;
 
+		nr = 1;
 		if (pte_none(ptent))
 			continue;
 
@@ -1444,44 +1599,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			break;
 
 		if (pte_present(ptent)) {
-			unsigned int delay_rmap;
-
-			page = vm_normal_page(vma, addr, ptent);
-			if (unlikely(!should_zap_page(details, page)))
-				continue;
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-			arch_check_zapped_pte(vma, ptent);
-			tlb_remove_tlb_entry(tlb, pte, addr);
-			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
-						      ptent);
-			if (unlikely(!page)) {
-				ksm_might_unmap_zero_page(mm, ptent);
-				continue;
-			}
-
-			folio = page_folio(page);
-			delay_rmap = 0;
-			if (!folio_test_anon(folio)) {
-				if (pte_dirty(ptent)) {
-					folio_mark_dirty(folio);
-					if (tlb_delay_rmap(tlb)) {
-						delay_rmap = 1;
-						force_flush = 1;
-					}
-				}
-				if (pte_young(ptent) && likely(vma_has_recency(vma)))
-					folio_mark_accessed(folio);
-			}
-			rss[mm_counter(page)]--;
-			if (!delay_rmap) {
-				folio_remove_rmap_pte(folio, page, vma);
-				if (unlikely(page_mapcount(page) < 0))
-					print_bad_pte(vma, addr, ptent, page);
-			}
-			if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
-				force_flush = 1;
-				addr += PAGE_SIZE;
+			max_nr = (end - addr) / PAGE_SIZE;
+			nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+					      addr, details, rss, &force_flush,
+					      &force_break);
+			if (unlikely(force_break)) {
+				addr += nr * PAGE_SIZE;
 				break;
 			}
 			continue;
@@ -1492,7 +1615,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		    is_device_exclusive_entry(entry)) {
 			page = pfn_swap_entry_to_page(entry);
 			folio = page_folio(page);
-			if (unlikely(!should_zap_page(details, page)))
+			if (unlikely(!should_zap_folio(details, folio)))
 				continue;
 			/*
 			 * Both device private/exclusive mappings should only
@@ -1501,7 +1624,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 * see zap_install_uffd_wp_if_needed().
 			 */
 			WARN_ON_ONCE(!vma_is_anonymous(vma));
-			rss[mm_counter(page)]--;
+			rss[mm_counter(folio)]--;
 			if (is_device_private_entry(entry))
 				folio_remove_rmap_pte(folio, page, vma);
 			folio_put(folio);
@@ -1513,10 +1636,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(!free_swap_and_cache(entry)))
 				print_bad_pte(vma, addr, ptent, NULL);
 		} else if (is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
-			if (!should_zap_page(details, page))
+			folio = pfn_swap_entry_folio(entry);
+			if (!should_zap_folio(details, folio))
 				continue;
-			rss[mm_counter(page)]--;
+			rss[mm_counter(folio)]--;
 		} else if (pte_marker_entry_uffd_wp(entry)) {
 			/*
 			 * For anon: always drop the marker; for file: only
@@ -1535,8 +1658,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			WARN_ON_ONCE(1);
 		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
@@ -1870,7 +1993,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
 		return -EBUSY;
 	/* Ok, finally just insert the thing.. */
 	folio_get(folio);
-	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+	inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
 	folio_add_file_rmap_pte(folio, page, vma);
 	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
 	return 0;
@@ -3081,7 +3204,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
 	return VM_FAULT_RETRY;
 }
 
-static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
+vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 
@@ -3175,7 +3298,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
-				dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+				dec_mm_counter(mm, mm_counter_file(old_folio));
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
@@ -3253,7 +3376,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		folio_put(new_folio);
 	if (old_folio) {
 		if (page_copied)
-			free_swap_cache(&old_folio->page);
+			free_swap_cache(old_folio);
 		folio_put(old_folio);
 	}
 
@@ -3376,6 +3499,16 @@ static bool wp_can_reuse_anon_folio(struct folio *folio,
 				    struct vm_area_struct *vma)
 {
 	/*
+	 * We could currently only reuse a subpage of a large folio if no
+	 * other subpages of the large folios are still mapped. However,
+	 * let's just consistently not reuse subpages even if we could
+	 * reuse in that scenario, and give back a large folio a bit
+	 * sooner.
+	 */
+	if (folio_test_large(folio))
+		return false;
+
+	/*
 	 * We have to verify under folio lock: these early checks are
 	 * just an optimization to avoid locking the folio and freeing
 	 * the swapcache if there is little hope that we can reuse.
@@ -4170,8 +4303,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
 
 static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
@@ -4223,15 +4356,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
 		if (folio) {
+			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+				folio_put(folio);
+				goto next;
+			}
+			folio_throttle_swaprate(folio, gfp);
 			clear_huge_page(&folio->page, vmf->address, 1 << order);
 			return folio;
 		}
+next:
 		order = next_order(&orders, order);
 	}
 
 fallback:
 #endif
-	return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+	return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
 }
 
 /*
@@ -4298,10 +4437,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	nr_pages = folio_nr_pages(folio);
 	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
 
-	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
-		goto oom_free_page;
-	folio_throttle_swaprate(folio, GFP_KERNEL);
-
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * preceding stores to the page contents become visible before
@@ -4355,8 +4490,6 @@ unlock:
 release:
 	folio_put(folio);
 	goto unlock;
-oom_free_page:
-	folio_put(folio);
 oom:
 	return VM_FAULT_OOM;
 }
@@ -4480,7 +4613,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	if (write)
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
 	folio_add_file_rmap_pmd(folio, page, vma);
 
 	/*
@@ -4543,7 +4676,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		folio_add_new_anon_rmap(folio, vma, addr);
 		folio_add_lru_vma(folio, vma);
 	} else {
-		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+		add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
 		folio_add_file_rmap_ptes(folio, page, nr, vma);
 	}
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4653,7 +4786,8 @@ static int fault_around_bytes_set(void *data, u64 val)
 	 * The minimum value is 1 page, however this results in no fault-around
 	 * at all. See should_fault_around().
	 */
-	fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+	val = max(val, PAGE_SIZE);
+	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
 
 	return 0;
 }
@@ -4928,18 +5062,18 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	int flags = 0;
 
 	/*
-	 * The "pte" at this point cannot be used safely without
-	 * validation through pte_unmap_same(). It's of NUMA type but
-	 * the pfn may be screwed if the read is non atomic.
+	 * The pte cannot be used safely until we verify, while holding the page
+	 * table lock, that its contents have not changed during fault handling.
	 */
 	spin_lock(vmf->ptl);
-	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+	/* Read the live PTE from the page tables: */
+	old_pte = ptep_get(vmf->pte);
+
+	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		goto out;
 	}
 
-	/* Get the normal PTE  */
-	old_pte = ptep_get(vmf->pte);
 	pte = pte_modify(old_pte, vma->vm_page_prot);
 
 	/*
@@ -6163,7 +6297,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct page *page = arg;
 
-	clear_user_highpage(page + idx, addr);
+	clear_user_highpage(nth_page(page, idx), addr);
 	return 0;
 }
 
@@ -6213,10 +6347,11 @@ struct copy_subpage_arg {
 static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
 	struct copy_subpage_arg *copy_arg = arg;
+	struct page *dst = nth_page(copy_arg->dst, idx);
+	struct page *src = nth_page(copy_arg->src, idx);
 
-	if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-				  addr, copy_arg->vma)) {
-		memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+	if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+		memory_failure_queue(page_to_pfn(src), 0);
 		return -EHWPOISON;
 	}
 	return 0;
 }
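
The shared idea behind the new copy_present_ptes() and zap_present_ptes() helpers above is to detect how many subsequent PTEs map consecutive pages of the same folio and then handle that run as one unit, so the surrounding do/while loops can advance by nr entries per iteration. The following stand-alone C sketch models only that loop structure; it is not kernel code, and every name in it (model_pte, model_pte_batch) is hypothetical.

#include <stdio.h>

struct model_pte {
	int folio_id;	/* which folio this entry maps */
	int page_idx;	/* index of the mapped page within that folio */
};

/* Count how many consecutive entries continue the run started at pte[0]. */
static int model_pte_batch(const struct model_pte *pte, int max_nr)
{
	int nr = 1;

	while (nr < max_nr &&
	       pte[nr].folio_id == pte[0].folio_id &&
	       pte[nr].page_idx == pte[0].page_idx + nr)
		nr++;
	return nr;
}

int main(void)
{
	/* Two 4-page "folios" followed by a single-page one. */
	struct model_pte ptes[] = {
		{ 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 },
		{ 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
		{ 3, 0 },
	};
	int total = (int)(sizeof(ptes) / sizeof(ptes[0]));
	int i = 0;

	/* Mirrors the "pte += nr, addr += PAGE_SIZE * nr" loop shape. */
	while (i < total) {
		int nr = model_pte_batch(&ptes[i], total - i);

		printf("folio %d: handling %d PTE(s) as one batch\n",
		       ptes[i].folio_id, nr);
		i += nr;
	}
	return 0;
}

In the patch itself, folio_pte_batch() plays the role that model_pte_batch() plays in this sketch, and the loops in copy_pte_range() and zap_pte_range() advance their PTE pointer and address by nr at a time.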
