Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	81
1 file changed, 24 insertions, 57 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..df2e7dd5ff17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
-	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_range_init(&range, src, vma->vm_start,
 					vma->vm_end);
 		mmu_notifier_invalidate_range_start(&range);
-	} else {
-		/*
-		 * For shared mappings i_mmap_rwsem must be held to call
-		 * huge_pte_alloc, otherwise the returned ptep could go
-		 * away if part of a shared pmd and another thread calls
-		 * huge_pmd_unshare.
-		 */
-		i_mmap_lock_read(mapping);
 	}
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
-
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte)
 			continue;
-
 		dst_pte = huge_pte_alloc(dst, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 	if (cow)
 		mmu_notifier_invalidate_range_end(&range);
-	else
-		i_mmap_unlock_read(mapping);
 
 	return ret;
 }
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * We can not race with truncation due to holding i_mmap_rwsem.
-	 * Check once here for faults beyond end of file.
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
 	 */
-	size = i_size_read(mapping->host) >> huge_page_shift(h);
-	if (idx >= size)
-		goto out;
-
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
+		if (idx >= size)
+			goto out;
+
 		/*
 		 * Check for page in userfault range
 		 */
@@ -3784,18 +3771,14 @@ retry:
 			};
 
 			/*
-			 * hugetlb_fault_mutex and i_mmap_rwsem must be
-			 * dropped before handling userfault.  Reacquire
-			 * after handling fault to make calling code simpler.
+			 * hugetlb_fault_mutex must be dropped before
+			 * handling userfault.  Reacquire after handling
+			 * fault to make calling code simpler.
 			 */
 			hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
 							idx, haddr);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-
 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
-			i_mmap_lock_read(mapping);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			goto out;
 		}
@@ -3854,6 +3837,9 @@ retry:
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
-		/*
-		 * Since we hold no locks, ptep could be stale.  That is
-		 * OK as we are only making decisions based on content and
-		 * not actually modifying content here.
-		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
 				VM_FAULT_SET_HINDEX(hstate_index(h));
+	} else {
+		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+		if (!ptep)
+			return VM_FAULT_OOM;
 	}
 
-	/*
-	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep.  This serves two purposes:
-	 * 1) It prevents huge_pmd_unshare from being called elsewhere
-	 *    and making the ptep no longer valid.
-	 * 2) It synchronizes us with file truncation.
-	 *
-	 * ptep could have already be assigned via huge_pte_offset.  That
-	 * is OK, as huge_pte_alloc will return the same value unless
-	 * something changed.
-	 */
 	mapping = vma->vm_file->f_mapping;
-	i_mmap_lock_read(mapping);
-	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-	if (!ptep) {
-		i_mmap_unlock_read(mapping);
-		return VM_FAULT_OOM;
-	}
+	idx = vma_hugecache_offset(h, vma, haddr);
 
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -4066,7 +4034,6 @@ out_ptl:
 	}
 out_mutex:
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-	i_mmap_unlock_read(mapping);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
 	 * here we just wait to defer the next page fault to avoid busy loop and
@@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space.  This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
@@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
@@ -4732,7 +4699,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *	    0 the underlying pte page is not shared, or it is the last user
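With the i_mmap_rwsem synchronization removed, the serialization that the hunks above still rely on in the fault path is the per-fault mutex table: hugetlb_fault() hashes the faulting (mapping, index) onto one mutex in hugetlb_fault_mutex_table so that two CPUs racing to instantiate the same page take the same lock. The userspace sketch below illustrates only that hashing pattern under stated assumptions; fault_mutex_hash(), struct fault, the table size and the hash constant are invented for the example and are not the kernel's hugetlb_fault_mutex_hash() or its real key. Build with `cc -pthread`.

/*
 * Minimal userspace sketch (not kernel code) of the fault-serialization
 * pattern: racing faulters on the same (mapping, index) hash to the same
 * mutex in a fixed table, so only one of them instantiates the page while
 * the others wait.  All names here are illustrative.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_FAULT_MUTEXES 8

static pthread_mutex_t fault_mutex_table[NUM_FAULT_MUTEXES];

/* Map a (mapping, index) pair onto one slot of the mutex table. */
static unsigned int fault_mutex_hash(const void *mapping, unsigned long idx)
{
	uintptr_t key = (uintptr_t)mapping ^ (idx * 2654435761UL);

	return (unsigned int)(key % NUM_FAULT_MUTEXES);
}

struct fault {
	const void *mapping;	/* stands in for struct address_space * */
	unsigned long idx;	/* page index within the mapping */
};

static void *fault_handler(void *arg)
{
	struct fault *f = arg;
	unsigned int hash = fault_mutex_hash(f->mapping, f->idx);

	pthread_mutex_lock(&fault_mutex_table[hash]);
	/*
	 * Only one thread per (mapping, idx) runs this section at a time;
	 * a real fault path would look up or allocate the page here.
	 */
	printf("handling fault at idx %lu under mutex slot %u\n", f->idx, hash);
	pthread_mutex_unlock(&fault_mutex_table[hash]);
	return NULL;
}

int main(void)
{
	static int dummy_mapping;
	struct fault f = { .mapping = &dummy_mapping, .idx = 42 };
	pthread_t threads[4];

	for (int i = 0; i < NUM_FAULT_MUTEXES; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);

	/* Four threads "fault" on the same page index and serialize. */
	for (int i = 0; i < 4; i++)
		pthread_create(&threads[i], NULL, fault_handler, &f);
	for (int i = 0; i < 4; i++)
		pthread_join(threads[i], NULL);
	return 0;
}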
