Diffstat (limited to 'mm/rmap.c')
-rw-r--r--  mm/rmap.c | 186
1 file changed, 162 insertions(+), 24 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..87b9e8ad4509 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 
@@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			if (unlikely(!anon_vma))
 				goto out_enomem_free_avc;
 			allocated = anon_vma;
+			/*
+			 * This VMA had no anon_vma yet.  This anon_vma is
+			 * the root of any anon_vma tree that might form.
+			 */
+			anon_vma->root = anon_vma;
 		}
 
-		spin_lock(&anon_vma->lock);
+		anon_vma_lock(anon_vma);
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
@@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			avc->anon_vma = anon_vma;
 			avc->vma = vma;
 			list_add(&avc->same_vma, &vma->anon_vma_chain);
-			list_add(&avc->same_anon_vma, &anon_vma->head);
+			list_add_tail(&avc->same_anon_vma, &anon_vma->head);
 			allocated = NULL;
 			avc = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
@@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	avc = anon_vma_chain_alloc();
 	if (!avc)
 		goto out_error_free_anon_vma;
-	anon_vma_chain_link(vma, avc, anon_vma);
+
+	/*
+	 * The root anon_vma's spinlock is the lock actually used when we
+	 * lock any of the anon_vmas in this anon_vma tree.
+	 */
+	anon_vma->root = pvma->anon_vma->root;
+	/*
+	 * With KSM refcounts, an anon_vma can stay around longer than the
+	 * process it belongs to.  The root anon_vma needs to be pinned
+	 * until this anon_vma is freed, because the lock lives in the root.
+	 */
+	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
+	anon_vma_chain_link(vma, avc, anon_vma);
 
 	return 0;
 
@@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 	if (!anon_vma)
 		return;
 
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_del(&anon_vma_chain->same_anon_vma);
 
 	/* We must garbage collect the anon_vma if it's empty */
 	empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 
-	if (empty)
+	if (empty) {
+		/* We no longer need the root anon_vma */
+		if (anon_vma->root != anon_vma)
+			drop_anon_vma(anon_vma->root);
 		anon_vma_free(anon_vma);
+	}
 }
 
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
 	struct anon_vma_chain *avc, *next;
 
-	/* Unlink each anon_vma chained to the VMA. */
+	/*
+	 * Unlink each anon_vma chained to the VMA.  This list is ordered
+	 * from newest to oldest, ensuring the root anon_vma gets freed last.
+	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		anon_vma_unlink(avc);
 		list_del(&avc->same_vma);
@@ -302,7 +327,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 		goto out;
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	return anon_vma;
 out:
 	rcu_read_unlock();
@@ -311,7 +336,7 @@ out:
 
 void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 	rcu_read_unlock();
 }
 
@@ -326,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned long address;
 
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		pgoff = page->index << huge_page_order(page_hstate(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 		/* page should be within @vma mapping range */
@@ -340,9 +367,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
-	if (PageAnon(page))
-		;
-	else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+	if (PageAnon(page)) {
+		if (vma->anon_vma->root != page_anon_vma(page)->root)
+			return -EFAULT;
+	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
 		if (!vma->vm_file ||
 		    vma->vm_file->f_mapping != page->mapping)
 			return -EFAULT;
@@ -369,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	if (unlikely(PageHuge(page))) {
+		pte = huge_pte_offset(mm, address);
+		ptl = &mm->page_table_lock;
+		goto check;
+	}
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		return NULL;
@@ -389,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	}
 
 	ptl = pte_lockptr(mm, pmd);
+check:
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -743,14 +778,20 @@ static void __page_set_anon_rmap(struct page *page,
 	 * If the page isn't exclusively mapped into this vma,
 	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
-	 *
-	 * So take the last AVC chain entry in the vma, which is
-	 * the deepest ancestor, and use the anon_vma from that.
 	 */
 	if (!exclusive) {
-		struct anon_vma_chain *avc;
-		avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
-		anon_vma = avc->anon_vma;
+		if (PageAnon(page))
+			return;
+		anon_vma = anon_vma->root;
+	} else {
+		/*
+		 * In this case, swapped-out-but-not-discarded swap-cache
+		 * is remapped, so there is no need to update page->mapping.
+		 * The anon_vma pointed to by page->mapping is not stale,
+		 * because vma->anon_vma must be in the same anon_vma family.
+		 */
+		if (PageAnon(page))
+			return;
 	}
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +821,7 @@ static void __page_check_anon_rmap(struct page *page,
 	 * are initially only visible via the pagetables, and the pte is locked
 	 * over the call to page_add_new_anon_rmap.
 	 */
+	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
 	BUG_ON(page->index != linear_page_index(vma, address));
 #endif
 }
@@ -798,6 +840,17 @@ static void __page_check_anon_rmap(struct page *page,
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
+	do_page_add_anon_rmap(page, vma, address, 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
 	int first = atomic_inc_and_test(&page->_mapcount);
 	if (first)
 		__inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +860,7 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (first)
-		__page_set_anon_rmap(page, vma, address, 0);
+		__page_set_anon_rmap(page, vma, address, exclusive);
 	else
 		__page_check_anon_rmap(page, vma, address);
 }
@@ -873,6 +926,12 @@ void page_remove_rmap(struct page *page)
 		page_clear_dirty(page);
 		set_page_dirty(page);
 	}
+	/*
+	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+	 * and not charged by memcg for now.
+	 */
+	if (unlikely(PageHuge(page)))
+		return;
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1368,6 +1427,42 @@ int try_to_munlock(struct page *page)
 		return try_to_unmap_file(page, TTU_MUNLOCK);
 }
 
+#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
+/*
+ * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
+ * if necessary.  Be careful to do all the tests under the lock.  Once
+ * we know we are the last user, nobody else can get a reference and we
+ * can do the freeing without the lock.
+ */
+void drop_anon_vma(struct anon_vma *anon_vma)
+{
+	BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
+	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
+		struct anon_vma *root = anon_vma->root;
+		int empty = list_empty(&anon_vma->head);
+		int last_root_user = 0;
+		int root_empty = 0;
+
+		/*
+		 * The refcount on a non-root anon_vma got dropped.  Drop
+		 * the refcount on the root and check if we need to free it.
+		 */
+		if (empty && anon_vma != root) {
+			BUG_ON(atomic_read(&root->external_refcount) <= 0);
+			last_root_user = atomic_dec_and_test(&root->external_refcount);
+			root_empty = list_empty(&root->head);
+		}
+		anon_vma_unlock(anon_vma);
+
+		if (empty) {
+			anon_vma_free(anon_vma);
+			if (root_empty && last_root_user)
+				anon_vma_free(root);
+		}
+	}
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 /*
  * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1484,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return ret;
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
@@ -1399,7 +1494,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		if (ret != SWAP_AGAIN)
 			break;
 	}
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 	return ret;
 }
 
@@ -1445,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 		return rmap_walk_file(page, rmap_one, arg);
 }
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	BUG_ON(!anon_vma);
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev,
+				 struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+	BUG_ON(!anon_vma);
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
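The hunks above replace direct spin_lock(&anon_vma->lock) calls with anon_vma_lock()/anon_vma_unlock(), and anon_vma_fork() pins the root with get_anon_vma(), but the include/linux/rmap.h side of the series is outside this diffstat. The following is only a minimal sketch of what those helpers amount to, inferred from the fields the diff itself uses (anon_vma->root, ->lock, ->external_refcount, ->head); the exact header definitions are not shown here and may differ in detail.

/* Sketch of the rmap.h counterpart of this series -- not part of this diff. */
struct anon_vma {
	spinlock_t lock;		/* serializes the vma list below */
	struct anon_vma *root;		/* root of this anon_vma tree */
	atomic_t external_refcount;	/* KSM/migration references */
	struct list_head head;		/* chain of anon_vma_chain entries */
};

/* Any anon_vma in a tree is locked by taking the root's spinlock. */
static inline void anon_vma_lock(struct anon_vma *anon_vma)
{
	spin_lock(&anon_vma->root->lock);
}

static inline void anon_vma_unlock(struct anon_vma *anon_vma)
{
	spin_unlock(&anon_vma->root->lock);
}

/* Pin an anon_vma so it can outlive the process that created it. */
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->external_refcount);
}

Because every anon_vma created by anon_vma_fork() shares its parent's root, taking root->lock once serializes the whole tree, which is why page_lock_anon_vma(), rmap_walk_anon() and anon_vma_unlink() can keep operating on whichever anon_vma the page or chain hands them.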
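The vma_address() hunk rescales page->index for hugetlb VMAs because hugetlbfs indexes its page cache in hugepage-sized units while vma->vm_pgoff stays in PAGE_SIZE units. A small userspace illustration of that arithmetic (illustration only, not kernel code; it assumes 4 KiB base pages and 2 MiB hugepages, i.e. huge_page_order == 9):

/* Why vma_address() shifts page->index by huge_page_order for hugepages. */
#include <assert.h>

int main(void)
{
	unsigned long vm_start = 0x40000000UL;	/* start of the hugetlb VMA */
	unsigned long vm_pgoff = 0;		/* file offset, in 4 KiB pages */
	unsigned long page_index = 2;		/* third hugepage of the file */
	unsigned int page_shift = 12, huge_page_order = 9;

	/* hugepage index is kept in hugepage units, so rescale to 4 KiB pages */
	unsigned long pgoff = page_index << huge_page_order;
	unsigned long address = vm_start + ((pgoff - vm_pgoff) << page_shift);

	/* the third hugepage maps at vm_start + 2 * 2 MiB */
	assert(address == vm_start + 2 * (2UL << 20));
	return 0;
}

Without the rescaling, the subtraction of vm_pgoff would mix hugepage-sized and PAGE_SIZE-sized offsets and try_to_unmap()/rmap_walk() would compute the wrong user address for hugepages.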
