| field | value | date |
|---|---|---|
| author | Jani Nikula <jani.nikula@intel.com> | 2025-06-09 12:40:46 +0300 |
| committer | Jani Nikula <jani.nikula@intel.com> | 2025-06-09 12:40:46 +0300 |
| commit | 34c55367af96f62e89221444f04487440ebc6487 (patch) | |
| tree | fdb36ba67d7dea09455b55037e26043b7e051ef9 /mm/hugetlb.c | |
| parent | 7247efca0dcbc8ac6147db9200ed1549c0662465 (diff) | |
| parent | 19272b37aa4f83ca52bdf9c16d5d81bdd1354494 (diff) | |
| download | linux-34c55367af96f62e89221444f04487440ebc6487.tar.xz | |
Merge drm/drm-next into drm-intel-next
Sync to v6.16-rc1, among other things to get the fixed-size GENMASK_U*()
and BIT_U*() macros.
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
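For context, GENMASK_U*() and BIT_U*() are the fixed-width variants of GENMASK() and BIT() in include/linux/bits.h that landed during the v6.16 cycle; unlike the classic macros, they are meant to yield a value of the named width (u8/u16/u32/u64) rather than unsigned long. A minimal usage sketch (the register layout below is hypothetical; only the macro names come from the kernel):

	#include <linux/bits.h>

	/* Hypothetical device register layout, for illustration only. */
	#define CTRL_MODE_MASK	GENMASK_U32(7, 4)	/* u32 mask covering bits 7..4 */
	#define CTRL_ENABLE	BIT_U8(0)		/* u8 value with bit 0 set */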
Diffstat (limited to 'mm/hugetlb.c')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | mm/hugetlb.c | 264 |
1 file changed, 172 insertions, 92 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e3e6ac991b9c..8746ed2fec13 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -58,6 +58,7 @@ int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
+__initdata nodemask_t hugetlb_bootmem_nodes;
 __initdata struct list_head huge_boot_pages[MAX_NUMNODES];
 
 static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
@@ -120,7 +121,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -1250,7 +1251,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 /*
  * Reset and decrement one ref on hugepage private reservation.
  * Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
  * same sized vma. It should never come here with last ref on the
  * reservation.
  */
@@ -1950,7 +1951,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
 	int order = huge_page_order(h);
 	struct folio *folio;
 	bool alloc_try_hard = true;
-	bool retry = true;
 
 	/*
 	 * By default we always try hard to allocate the folio with
@@ -1965,22 +1965,8 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
 		gfp_mask |= __GFP_RETRY_MAYFAIL;
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
 
-retry:
-	folio = __folio_alloc(gfp_mask, order, nid, nmask);
-	/* Ensure hugetlb folio won't have large_rmappable flag set. */
-	if (folio)
-		folio_clear_large_rmappable(folio);
-	if (folio && !folio_ref_freeze(folio, 1)) {
-		folio_put(folio);
-		if (retry) {	/* retry once */
-			retry = false;
-			goto retry;
-		}
-		/* WOW!  twice in a row. */
-		pr_warn("HugeTLB unexpected inflated folio ref count\n");
-		folio = NULL;
-	}
+	folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
 
 	/*
 	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
@@ -2419,7 +2405,6 @@ static int gather_surplus_pages(struct hstate *h, long delta)
 	long i;
 	long needed, allocated;
 	bool alloc_ok = true;
-	int node;
 	nodemask_t *mbind_nodemask, alloc_nodemask;
 
 	mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
@@ -2443,21 +2428,12 @@ retry:
 
 	for (i = 0; i < needed; i++) {
 		folio = NULL;
-		/* Prioritize current node */
-		if (node_isset(numa_mem_id(), alloc_nodemask))
-			folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
-					numa_mem_id(), NULL);
-
-		if (!folio) {
-			for_each_node_mask(node, alloc_nodemask) {
-				if (node == numa_mem_id())
-					continue;
-				folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
-						node, NULL);
-				if (folio)
-					break;
-			}
-		}
+		/*
+		 * It is okay to use NUMA_NO_NODE because we use numa_mem_id()
+		 * down the road to pick the current node if that is the case.
+		 */
+		folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+						    NUMA_NO_NODE, &alloc_nodemask);
 		if (!folio) {
 			alloc_ok = false;
 			break;
@@ -2896,10 +2872,9 @@ free_new:
 	return ret;
 }
 
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
 {
 	struct hstate *h;
-	struct folio *folio = page_folio(page);
 	int ret = -EBUSY;
 
 	/*
@@ -2949,12 +2924,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
 
 	while (start_pfn < end_pfn) {
 		folio = pfn_folio(start_pfn);
+
+		/*
+		 * The folio might have been dissolved from under our feet, so make sure
+		 * to carefully check the state under the lock.
+		 */
+		spin_lock_irq(&hugetlb_lock);
 		if (folio_test_hugetlb(folio)) {
 			h = folio_hstate(folio);
 		} else {
+			spin_unlock_irq(&hugetlb_lock);
 			start_pfn++;
 			continue;
 		}
+		spin_unlock_irq(&hugetlb_lock);
 
 		if (!folio_ref_count(folio)) {
 			ret = alloc_and_dissolve_hugetlb_folio(h, folio,
@@ -3010,7 +2993,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct folio *folio;
-	long retval, gbl_chg;
+	long retval, gbl_chg, gbl_reserve;
 	map_chg_state map_chg;
 	int ret, idx;
 	struct hugetlb_cgroup *h_cg = NULL;
@@ -3163,8 +3146,16 @@ out_uncharge_cgroup_reservation:
 		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
 						    h_cg);
 out_subpool_put:
-	if (map_chg)
-		hugepage_subpool_put_pages(spool, 1);
+	/*
+	 * put page to subpool iff the quota of subpool's rsv_hpages is used
+	 * during hugepage_subpool_get_pages.
+	 */
+	if (map_chg && !gbl_chg) {
+		gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+		hugetlb_acct_memory(h, -gbl_reserve);
+	}
+
+
 out_end_reservation:
 	if (map_chg != MAP_CHG_ENFORCED)
 		vma_end_reservation(h, vma, addr);
@@ -3237,7 +3228,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 	}
 
 	/* allocate from next node when distributing huge pages */
-	for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
+	for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
+				    &hugetlb_bootmem_nodes) {
 		m = alloc_bootmem(h, node, false);
 		if (!m)
 			return 0;
@@ -3701,6 +3693,15 @@ static void __init hugetlb_init_hstates(void)
 	struct hstate *h, *h2;
 
 	for_each_hstate(h) {
+		/*
+		 * Always reset to first_memory_node here, even if
+		 * next_nid_to_alloc was set before - we can't
+		 * reference hugetlb_bootmem_nodes after init, and
+		 * first_memory_node is right for all further allocations.
+		 */
+		h->next_nid_to_alloc = first_memory_node;
+		h->next_nid_to_free = first_memory_node;
+
 		/* oversize hugepages were init'ed in early boot */
 		if (!hstate_is_gigantic(h))
 			hugetlb_hstate_alloc_pages(h);
@@ -3740,7 +3741,7 @@ static void __init report_hugepages(void)
 
 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
 		pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
-			buf, h->free_huge_pages);
+			buf, h->nr_huge_pages);
 		if (nrinvalid)
 			pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
 					buf, nrinvalid, nrinvalid > 1 ? "s" : "");
@@ -4034,10 +4035,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
 
 	list_for_each_entry_safe(folio, next, src_list, lru) {
 		int i;
+		bool cma;
 
 		if (folio_test_hugetlb_vmemmap_optimized(folio))
 			continue;
 
+		cma = folio_test_hugetlb_cma(folio);
+
 		list_del(&folio->lru);
 
 		split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
@@ -4053,6 +4057,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
 
 			new_folio->mapping = NULL;
 			init_new_hugetlb_folio(dst, new_folio);
+			/* Copy the CMA flag so that it is freed correctly */
+			if (cma)
+				folio_set_hugetlb_cma(new_folio);
 			list_add(&new_folio->lru, &dst_list);
 		}
 	}
@@ -5007,6 +5014,20 @@ static int __init default_hugepagesz_setup(char *s)
 }
 hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
 
+void __init hugetlb_bootmem_set_nodes(void)
+{
+	int i, nid;
+	unsigned long start_pfn, end_pfn;
+
+	if (!nodes_empty(hugetlb_bootmem_nodes))
+		return;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		if (end_pfn > start_pfn)
+			node_set(nid, hugetlb_bootmem_nodes);
+	}
+}
+
 static bool __hugetlb_bootmem_allocated __initdata;
 
 bool __init hugetlb_bootmem_allocated(void)
@@ -5022,6 +5043,8 @@ void __init hugetlb_bootmem_alloc(void)
 
 	if (__hugetlb_bootmem_allocated)
 		return;
 
+	hugetlb_bootmem_set_nodes();
+
 	for (i = 0; i < MAX_NUMNODES; i++)
 		INIT_LIST_HEAD(&huge_boot_pages[i]);
@@ -5029,7 +5052,6 @@ void __init hugetlb_bootmem_alloc(void)
 
 	for_each_hstate(h) {
 		h->next_nid_to_alloc = first_online_node;
-		h->next_nid_to_free = first_online_node;
 
 		if (hstate_is_gigantic(h))
 			hugetlb_hstate_alloc_pages(h);
@@ -5404,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -5458,18 +5494,16 @@ const struct vm_operations_struct hugetlb_vm_ops = {
 	.pagesize = hugetlb_vm_op_pagesize,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
 		bool try_mkwrite)
 {
-	pte_t entry;
+	pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
 	unsigned int shift = huge_page_shift(hstate_vma(vma));
 
 	if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
-		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
-					 vma->vm_page_prot)));
+		entry = pte_mkwrite_novma(pte_mkdirty(entry));
 	} else {
-		entry = huge_pte_wrprotect(mk_huge_pte(page,
-					   vma->vm_page_prot));
+		entry = pte_wrprotect(entry);
 	}
 	entry = pte_mkyoung(entry);
 	entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
@@ -5524,7 +5558,7 @@ static void
 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
 		      struct folio *new_folio, pte_t old, unsigned long sz)
 {
-	pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
+	pte_t newpte = make_huge_pte(vma, new_folio, true);
 
 	__folio_mark_uptodate(new_folio);
 	hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5828,14 +5862,14 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			    unsigned long start, unsigned long end,
-			    struct page *ref_page, zap_flags_t zap_flags)
+			    struct folio *folio, zap_flags_t zap_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	const bool folio_provided = !!folio;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	spinlock_t *ptl;
-	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	bool adjust_reservation = false;
@@ -5899,14 +5933,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			continue;
 		}
 
-		page = pte_page(pte);
 		/*
-		 * If a reference page is supplied, it is because a specific
-		 * page is being unmapped, not a range. Ensure the page we
-		 * are about to unmap is the actual page of interest.
+		 * If a folio is supplied, it is because a specific
+		 * folio is being unmapped, not a range. Ensure the folio we
+		 * are about to unmap is the actual folio of interest.
 		 */
-		if (ref_page) {
-			if (page != ref_page) {
+		if (folio_provided) {
+			if (folio != page_folio(pte_page(pte))) {
 				spin_unlock(ptl);
 				continue;
 			}
@@ -5916,12 +5949,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			 * looking like data was lost
 			 */
 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+		} else {
+			folio = page_folio(pte_page(pte));
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
 		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
 		if (huge_pte_dirty(pte))
-			set_page_dirty(page);
+			folio_mark_dirty(folio);
 		/* Leave a uffd-wp pte marker if needed */
 		if (huge_pte_uffd_wp(pte) &&
 		    !(zap_flags & ZAP_FLAG_DROP_MARKER))
@@ -5929,7 +5964,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 					make_pte_marker(PTE_MARKER_UFFD_WP),
 					sz);
 		hugetlb_count_sub(pages_per_huge_page(h), mm);
-		hugetlb_remove_rmap(page_folio(page));
+		hugetlb_remove_rmap(folio);
 
 		/*
 		 * Restore the reservation for anonymous page, otherwise the
@@ -5938,8 +5973,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 * reservation bit.
 		 */
 		if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
-		    folio_test_anon(page_folio(page))) {
-			folio_set_hugetlb_restore_reserve(page_folio(page));
+		    folio_test_anon(folio)) {
+			folio_set_hugetlb_restore_reserve(folio);
 			/* Reservation to be adjusted after the spin lock */
 			adjust_reservation = true;
 		}
@@ -5963,16 +5998,17 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				 * count will not be incremented by free_huge_folio.
 				 * Act as if we consumed the reservation.
 				 */
-				folio_clear_hugetlb_restore_reserve(page_folio(page));
+				folio_clear_hugetlb_restore_reserve(folio);
 			else if (rc)
 				vma_add_reservation(h, vma, address);
 		}
 
-		tlb_remove_page_size(tlb, page, huge_page_size(h));
+		tlb_remove_page_size(tlb, folio_page(folio, 0),
+				     folio_size(folio));
 		/*
-		 * Bail out after unmapping reference page if supplied
+		 * If we were instructed to unmap a specific folio, we're done.
 		 */
-		if (ref_page)
+		if (folio_provided)
 			break;
 	}
 	tlb_end_vma(tlb, vma);
@@ -6034,7 +6070,7 @@ void __hugetlb_zap_end(struct vm_area_struct *vma,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end, struct page *ref_page,
+			  unsigned long end, struct folio *folio,
 			  zap_flags_t zap_flags)
 {
 	struct mmu_notifier_range range;
@@ -6046,7 +6082,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	mmu_notifier_invalidate_range_start(&range);
 	tlb_gather_mmu(&tlb, vma->vm_mm);
 
-	__unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
+	__unmap_hugepage_range(&tlb, vma, start, end,
+			       folio, zap_flags);
 
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb);
@@ -6059,7 +6096,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
  * same region.
  */
 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
-			      struct page *page, unsigned long address)
+			      struct folio *folio, unsigned long address)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
@@ -6103,7 +6140,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 			unmap_hugepage_range(iter_vma, address,
-					     address + huge_page_size(h), page, 0);
+					     address + huge_page_size(h),
+					     folio, 0);
 	}
 	i_mmap_unlock_write(mapping);
 }
@@ -6226,8 +6264,7 @@ retry_avoidcopy:
 			hugetlb_vma_unlock_read(vma);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
-			unmap_ref_private(mm, vma, &old_folio->page,
-					vmf->address);
+			unmap_ref_private(mm, vma, old_folio, vmf->address);
 
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			hugetlb_vma_lock_read(vma);
@@ -6274,7 +6311,7 @@ retry_avoidcopy:
 	spin_lock(vmf->ptl);
 	vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
 	if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
-		pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+		pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
 
 		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
@@ -6554,7 +6591,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
 		hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
 	else
 		hugetlb_add_file_rmap(folio);
-	new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
+	new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
 	/*
 	 * If this pte was previously wr-protected, keep it wr-protected even
 	 * if populated.
@@ -7039,7 +7076,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
 	 * with wp flag set, don't set pte write bit.
 	 */
-	_dst_pte = make_huge_pte(dst_vma, &folio->page,
+	_dst_pte = make_huge_pte(dst_vma, folio,
 				 !wp_enabled && !(is_continue && !vm_shared));
 	/*
 	 * Always mark UFFDIO_COPY page dirty; note that this may not be
@@ -7233,7 +7270,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
 					struct vm_area_struct *vma,
 					vm_flags_t vm_flags)
 {
-	long chg = -1, add = -1;
+	long chg = -1, add = -1, spool_resv, gbl_resv;
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
@@ -7368,8 +7405,16 @@ bool hugetlb_reserve_pages(struct inode *inode,
 	return true;
 
 out_put_pages:
-	/* put back original number of pages, chg */
-	(void)hugepage_subpool_put_pages(spool, chg);
+	spool_resv = chg - gbl_reserve;
+	if (spool_resv) {
+		/* put sub pool's reservation back, chg - gbl_reserve */
+		gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+		/*
+		 * subpool's reserved pages can not be put back due to race,
+		 * return to hstate.
+		 */
+		hugetlb_acct_memory(h, -gbl_resv);
+	}
 out_uncharge_cgroup:
 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
 					    chg * pages_per_huge_page(h), h_cg);
@@ -7584,6 +7629,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 
 	pud_clear(pud);
+	/*
+	 * Once our caller drops the rmap lock, some other process might be
+	 * using this page table as a normal, non-hugetlb page table.
+	 * Wait for pending gup_fast() in other threads to finish before letting
+	 * that happen.
+	 */
+	tlb_remove_table_sync_one();
 	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
 	mm_dec_nr_pmds(mm);
 	return 1;
@@ -7854,9 +7906,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	spin_unlock_irq(&hugetlb_lock);
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				   unsigned long start,
-				   unsigned long end)
+				   unsigned long end,
+				   bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7880,8 +7939,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7891,8 +7954,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7907,5 +7972,20 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
+}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		clear_vma_resv_huge_pages(vma);
 }
