Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 250
1 file changed, 178 insertions, 72 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a9ac6c26832..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 	spin_unlock(&spool->lock);
 
 	/* If no pages are used, and no other handles to the subpool
-	 * remain, free the subpool the subpool remain */
-	if (free)
+	 * remain, give up any reservations based on minimum size and
+	 * free the subpool */
+	if (free) {
+		if (spool->min_hpages != -1)
+			hugetlb_acct_memory(spool->hstate,
+						-spool->min_hpages);
 		kfree(spool);
+	}
 }
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+						long min_hpages)
 {
 	struct hugepage_subpool *spool;
 
-	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
 	if (!spool)
 		return NULL;
 
 	spin_lock_init(&spool->lock);
 	spool->count = 1;
-	spool->max_hpages = nr_blocks;
-	spool->used_hpages = 0;
+	spool->max_hpages = max_hpages;
+	spool->hstate = h;
+	spool->min_hpages = min_hpages;
+
+	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+		kfree(spool);
+		return NULL;
+	}
+	spool->rsv_hpages = min_hpages;
 
 	return spool;
 }
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
 	unlock_or_release_subpool(spool);
 }
 
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * request.  Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward).  The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
 				      long delta)
 {
-	int ret = 0;
+	long ret = delta;
 
 	if (!spool)
-		return 0;
+		return ret;
 
 	spin_lock(&spool->lock);
-	if ((spool->used_hpages + delta) <= spool->max_hpages) {
-		spool->used_hpages += delta;
-	} else {
-		ret = -ENOMEM;
+
+	if (spool->max_hpages != -1) {		/* maximum size accounting */
+		if ((spool->used_hpages + delta) <= spool->max_hpages)
+			spool->used_hpages += delta;
+		else {
+			ret = -ENOMEM;
+			goto unlock_ret;
+		}
 	}
-	spin_unlock(&spool->lock);
 
+	if (spool->min_hpages != -1) {		/* minimum size accounting */
+		if (delta > spool->rsv_hpages) {
+			/*
+			 * Asking for more reserves than those already taken on
+			 * behalf of subpool.  Return difference.
+			 */
+			ret = delta - spool->rsv_hpages;
+			spool->rsv_hpages = 0;
+		} else {
+			ret = 0;	/* reserves already accounted for */
+			spool->rsv_hpages -= delta;
+		}
+	}
+
+unlock_ret:
+	spin_unlock(&spool->lock);
 	return ret;
 }
 
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
 				       long delta)
 {
+	long ret = delta;
+
 	if (!spool)
-		return;
+		return delta;
 
 	spin_lock(&spool->lock);
-	spool->used_hpages -= delta;
-	/* If hugetlbfs_put_super couldn't free spool due to
-	* an outstanding quota reference, free it now. */
+
+	if (spool->max_hpages != -1)		/* maximum size accounting */
+		spool->used_hpages -= delta;
+
+	if (spool->min_hpages != -1) {		/* minimum size accounting */
+		if (spool->rsv_hpages + delta <= spool->min_hpages)
+			ret = 0;
+		else
+			ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+		spool->rsv_hpages += delta;
+		if (spool->rsv_hpages > spool->min_hpages)
+			spool->rsv_hpages = spool->min_hpages;
+	}
+
+	/*
+	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
+	 * quota reference, free it now.
	 */
 	unlock_or_release_subpool(spool);
+
+	return ret;
 }
 
 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageHuge(page), page);
+	return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+	SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+	ClearPagePrivate(&page[1]);
+}
+
 void free_huge_page(struct page *page)
 {
 	/*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);
 
+	/*
+	 * A return code of zero implies that the subpool will be under its
+	 * minimum size if the reservation is not restored after page is free.
+	 * Therefore, force restore_reserve operation.
+	 */
+	if (hugepage_subpool_put_pages(spool, 1) == 0)
+		restore_reserve = true;
+
 	spin_lock(&hugetlb_lock);
+	clear_page_huge_active(page);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
 				     pages_per_huge_page(h), page);
 	if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -917,7 +1019,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 	__SetPageHead(page);
 	__ClearPageReserved(page);
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-		__SetPageTail(p);
 		/*
 		 * For gigantic hugepages allocated through bootmem at
 		 * boot, it's safer to be consistent with the not-gigantic
@@ -933,6 +1034,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 		__ClearPageReserved(p);
 		set_page_count(p, 0);
 		p->first_page = page;
+		/* Make sure p->first_page is always valid for PageTail() */
+		smp_wmb();
+		__SetPageTail(p);
 	}
 }
 
@@ -1384,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (chg < 0)
 		return ERR_PTR(-ENOMEM);
 	if (chg || avoid_reserve)
-		if (hugepage_subpool_get_pages(spool, 1))
+		if (hugepage_subpool_get_pages(spool, 1) < 0)
 			return ERR_PTR(-ENOSPC);
 
 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2452,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 	struct resv_map *resv = vma_resv_map(vma);
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve, start, end;
+	long gbl_reserve;
 
 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		return;
@@ -2464,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 	kref_put(&resv->refs, resv_map_release);
 
 	if (reserve) {
-		hugetlb_acct_memory(h, -reserve);
-		hugepage_subpool_put_pages(spool, reserve);
+		/*
+		 * Decrement reserve counts.  The global reserve count may be
+		 * adjusted if the subpool has a minimum size.
+		 */
+		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+		hugetlb_acct_memory(h, -gbl_reserve);
 	}
 }
 
@@ -2889,6 +2998,7 @@ retry_avoidcopy:
 	copy_user_huge_page(new_page, old_page, address, vma,
 			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
+	set_page_huge_active(new_page);
 
 	mmun_start = address & huge_page_mask(h);
 	mmun_end = mmun_start + huge_page_size(h);
@@ -3001,6 +3111,7 @@ retry:
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
+		set_page_huge_active(page);
 
 		if (vma->vm_flags & VM_MAYSHARE) {
 			int err;
@@ -3276,6 +3387,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct page *page;
 
 		/*
+		 * If we have a pending SIGKILL, don't keep faulting pages and
+		 * potentially allocating memory.
+		 */
+		if (unlikely(fatal_signal_pending(current))) {
+			remainder = 0;
+			break;
+		}
+
+		/*
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
 		 * each hugepage. We have to make sure we get the
 		 * first, for the page indexing below to work.
@@ -3436,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	struct hstate *h = hstate_inode(inode);
 	struct hugepage_subpool *spool = subpool_inode(inode);
 	struct resv_map *resv_map;
+	long gbl_reserve;
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
@@ -3472,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
 		goto out_err;
 	}
 
-	/* There must be enough pages in the subpool for the mapping */
-	if (hugepage_subpool_get_pages(spool, chg)) {
+	/*
+	 * There must be enough pages in the subpool for the mapping. If
+	 * the subpool has a minimum size, there may be some global
+	 * reservations already in place (gbl_reserve).
+	 */
+	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+	if (gbl_reserve < 0) {
 		ret = -ENOSPC;
 		goto out_err;
 	}
@@ -3482,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * Check enough hugepages are available for the reservation.
 	 * Hand the pages back to the subpool if there are not
 	 */
-	ret = hugetlb_acct_memory(h, chg);
+	ret = hugetlb_acct_memory(h, gbl_reserve);
 	if (ret < 0) {
-		hugepage_subpool_put_pages(spool, chg);
+		/* put back original number of pages, chg */
+		(void)hugepage_subpool_put_pages(spool, chg);
 		goto out_err;
 	}
 
@@ -3514,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	struct resv_map *resv_map = inode_resv_map(inode);
 	long chg = 0;
 	struct hugepage_subpool *spool = subpool_inode(inode);
+	long gbl_reserve;
 
 	if (resv_map)
 		chg = region_truncate(resv_map, offset);
@@ -3521,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugepage_subpool_put_pages(spool, (chg - freed));
-	hugetlb_acct_memory(h, -(chg - freed));
+	/*
+	 * If the subpool has a minimum size, the number of global
+	 * reservations to be released may be adjusted.
+	 */
+	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+	hugetlb_acct_memory(h, -gbl_reserve);
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3733,8 +3865,7 @@ retry:
 	if (!pmd_huge(*pmd))
 		goto out;
 	if (pmd_present(*pmd)) {
-		page = pte_page(*(pte_t *)pmd) +
-			((address & ~PMD_MASK) >> PAGE_SHIFT);
+		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
 		if (flags & FOLL_GET)
 			get_page(page);
 	} else {
@@ -3765,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #ifdef CONFIG_MEMORY_FAILURE
 
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-	struct page *page;
-	struct page *tmp;
-	struct hstate *h = page_hstate(hpage);
-	int nid = page_to_nid(hpage);
-
-	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-		if (page == hpage)
-			return 1;
-	return 0;
-}
-
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
@@ -3790,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	if (is_hugepage_on_freelist(hpage)) {
+	/*
+	 * Just checking !page_huge_active is not enough, because that could be
+	 * an isolated/hwpoisoned hugepage (which have >0 refcount).
+	 */
+	if (!page_huge_active(hpage) && !page_count(hpage)) {
 		/*
 		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
 		 * but dangling hpage->lru can trigger list-debug warnings
@@ -3810,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
+	bool ret = true;
+
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	if (!get_page_unless_zero(page))
-		return false;
 	spin_lock(&hugetlb_lock);
+	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+		ret = false;
+		goto unlock;
+	}
+	clear_page_huge_active(page);
 	list_move_tail(&page->lru, list);
+unlock:
 	spin_unlock(&hugetlb_lock);
-	return true;
+	return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	spin_lock(&hugetlb_lock);
+	set_page_huge_active(page);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
 }
-
-bool is_hugepage_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHuge(page), page);
-	/*
-	 * This function can be called for a tail page because the caller,
-	 * scan_movable_pages, scans through a given pfn-range which typically
-	 * covers one memory block. In systems using gigantic hugepage (1GB
-	 * for x86_64,) a hugepage is larger than a memory block, and we don't
-	 * support migrating such large hugepages for now, so return false
-	 * when called for tail pages.
-	 */
-	if (PageTail(page))
-		return false;
-	/*
-	 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-	 * so we should return false for them.
-	 */
-	if (unlikely(PageHWPoison(page)))
-		return false;
-	return page_count(page) > 0;
-}
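For readers following the subpool changes above, the sketch below models the new minimum/maximum size accounting in plain C. It is not kernel code: locking, the hstate bookkeeping and the hugetlb_acct_memory() interaction are omitted, and "struct subpool", subpool_get_pages() and subpool_put_pages() are hypothetical stand-ins for the patched helpers, with made-up demo values in main().

/*
 * Simplified userspace model of the subpool min/max accounting in the
 * diff above.  Only the fields the two helpers touch are mirrored here.
 */
#include <stdio.h>

struct subpool {
	long max_hpages;	/* -1 means no maximum */
	long min_hpages;	/* -1 means no minimum */
	long used_hpages;	/* pages charged against the maximum */
	long rsv_hpages;	/* pages still reserved for the minimum */
};

/* Returns the upward adjustment for the global pool, or -1 on overcommit. */
static long subpool_get_pages(struct subpool *spool, long delta)
{
	long ret = delta;

	if (spool->max_hpages != -1) {	/* maximum size accounting */
		if (spool->used_hpages + delta <= spool->max_hpages)
			spool->used_hpages += delta;
		else
			return -1;
	}

	if (spool->min_hpages != -1) {	/* minimum size accounting */
		if (delta > spool->rsv_hpages) {
			/* only the part beyond the reserve hits the global pool */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* fully covered by the reserve */
			spool->rsv_hpages -= delta;
		}
	}
	return ret;
}

/* Returns how many global reservations may be dropped when freeing delta. */
static long subpool_put_pages(struct subpool *spool, long delta)
{
	long ret = delta;

	if (spool->max_hpages != -1)	/* maximum size accounting */
		spool->used_hpages -= delta;

	if (spool->min_hpages != -1) {	/* minimum size accounting */
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}
	return ret;
}

int main(void)
{
	/* subpool capped at 8 pages with 4 pages reserved globally up front */
	struct subpool sp = { .max_hpages = 8, .min_hpages = 4,
			      .used_hpages = 0, .rsv_hpages = 4 };

	/* first 4 pages come out of the subpool's own reserve */
	printf("get 4 -> global adjustment %ld\n", subpool_get_pages(&sp, 4));
	/* the next 2 exceed the reserve and must be charged globally */
	printf("get 2 -> global adjustment %ld\n", subpool_get_pages(&sp, 2));
	/* freeing 6 refills the reserve first, then releases the rest */
	printf("put 6 -> global release %ld\n", subpool_put_pages(&sp, 6));
	return 0;
}

Built with any C compiler, the three printf lines report 0, 2 and 2: the first four pages are satisfied from the subpool's pre-reserved minimum, the next two spill into the global pool, and freeing six pages tops the reserve back up to four before releasing the remaining two global reservations, which is the behaviour the patched hugepage_subpool_get_pages()/put_pages() return values convey to hugetlb_acct_memory().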