summaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c250
1 files changed, 178 insertions, 72 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a9ac6c26832..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, free the subpool the subpool remain */
- if (free)
+ * remain, give up any reservations mased on minimum size and
+ * free the subpool */
+ if (free) {
+ if (spool->min_hpages != -1)
+ hugetlb_acct_memory(spool->hstate,
+ -spool->min_hpages);
kfree(spool);
+ }
}
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+ long min_hpages)
{
struct hugepage_subpool *spool;
- spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ spool = kzalloc(sizeof(*spool), GFP_KERNEL);
if (!spool)
return NULL;
spin_lock_init(&spool->lock);
spool->count = 1;
- spool->max_hpages = nr_blocks;
- spool->used_hpages = 0;
+ spool->max_hpages = max_hpages;
+ spool->hstate = h;
+ spool->min_hpages = min_hpages;
+
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ kfree(spool);
+ return NULL;
+ }
+ spool->rsv_hpages = min_hpages;
return spool;
}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
unlock_or_release_subpool(spool);
}
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * the request. Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward). The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be manitained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
{
- int ret = 0;
+ long ret = delta;
if (!spool)
- return 0;
+ return ret;
spin_lock(&spool->lock);
- if ((spool->used_hpages + delta) <= spool->max_hpages) {
- spool->used_hpages += delta;
- } else {
- ret = -ENOMEM;
+
+ if (spool->max_hpages != -1) { /* maximum size accounting */
+ if ((spool->used_hpages + delta) <= spool->max_hpages)
+ spool->used_hpages += delta;
+ else {
+ ret = -ENOMEM;
+ goto unlock_ret;
+ }
}
- spin_unlock(&spool->lock);
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (delta > spool->rsv_hpages) {
+ /*
+ * Asking for more reserves than those already taken on
+ * behalf of subpool. Return difference.
+ */
+ ret = delta - spool->rsv_hpages;
+ spool->rsv_hpages = 0;
+ } else {
+ ret = 0; /* reserves already accounted for */
+ spool->rsv_hpages -= delta;
+ }
+ }
+
+unlock_ret:
+ spin_unlock(&spool->lock);
return ret;
}
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
+ long ret = delta;
+
if (!spool)
- return;
+ return delta;
spin_lock(&spool->lock);
- spool->used_hpages -= delta;
- /* If hugetlbfs_put_super couldn't free spool due to
- * an outstanding quota reference, free it now. */
+
+ if (spool->max_hpages != -1) /* maximum size accounting */
+ spool->used_hpages -= delta;
+
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (spool->rsv_hpages + delta <= spool->min_hpages)
+ ret = 0;
+ else
+ ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+ spool->rsv_hpages += delta;
+ if (spool->rsv_hpages > spool->min_hpages)
+ spool->rsv_hpages = spool->min_hpages;
+ }
+
+ /*
+ * If hugetlbfs_put_super couldn't free spool due to an outstanding
+ * quota reference, free it now.
+ */
unlock_or_release_subpool(spool);
+
+ return ret;
}
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
+ return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ ClearPagePrivate(&page[1]);
+}
+
void free_huge_page(struct page *page)
{
/*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
+ /*
+ * A return code of zero implies that the subpool will be under its
+ * minimum size if the reservation is not restored after page is free.
+ * Therefore, force restore_reserve operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+
spin_lock(&hugetlb_lock);
+ clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -917,7 +1019,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
__SetPageHead(page);
__ClearPageReserved(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
- __SetPageTail(p);
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
@@ -933,6 +1034,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
__ClearPageReserved(p);
set_page_count(p, 0);
p->first_page = page;
+ /* Make sure p->first_page is always valid for PageTail() */
+ smp_wmb();
+ __SetPageTail(p);
}
}
@@ -1384,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (chg < 0)
return ERR_PTR(-ENOMEM);
if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1))
+ if (hugepage_subpool_get_pages(spool, 1) < 0)
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2452,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
+ long gbl_reserve;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -2464,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
kref_put(&resv->refs, resv_map_release);
if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
+ /*
+ * Decrement reserve counts. The global reserve count may be
+ * adjusted if the subpool has a minimum size.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+ hugetlb_acct_memory(h, -gbl_reserve);
}
}
@@ -2889,6 +2998,7 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
+ set_page_huge_active(new_page);
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
@@ -3001,6 +3111,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
+ set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
int err;
@@ -3276,6 +3387,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
/*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current))) {
+ remainder = 0;
+ break;
+ }
+
+ /*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
@@ -3436,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
+ long gbl_reserve;
/*
* Only apply hugepage reservation if asked. At fault time, an
@@ -3472,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
- /* There must be enough pages in the subpool for the mapping */
- if (hugepage_subpool_get_pages(spool, chg)) {
+ /*
+ * There must be enough pages in the subpool for the mapping. If
+ * the subpool has a minimum size, there may be some global
+ * reservations already in place (gbl_reserve).
+ */
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_err;
}
@@ -3482,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, chg);
+ ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- hugepage_subpool_put_pages(spool, chg);
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
goto out_err;
}
@@ -3514,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
+ long gbl_reserve;
if (resv_map)
chg = region_truncate(resv_map, offset);
@@ -3521,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugepage_subpool_put_pages(spool, (chg - freed));
- hugetlb_acct_memory(h, -(chg - freed));
+ /*
+ * If the subpool has a minimum size, the number of global
+ * reservations to be released may be adjusted.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -gbl_reserve);
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3733,8 +3865,7 @@ retry:
if (!pmd_huge(*pmd))
goto out;
if (pmd_present(*pmd)) {
- page = pte_page(*(pte_t *)pmd) +
- ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
@@ -3765,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
#ifdef CONFIG_MEMORY_FAILURE
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
- struct page *page;
- struct page *tmp;
- struct hstate *h = page_hstate(hpage);
- int nid = page_to_nid(hpage);
-
- list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
- if (page == hpage)
- return 1;
- return 0;
-}
-
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
@@ -3790,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- if (is_hugepage_on_freelist(hpage)) {
+ /*
+ * Just checking !page_huge_active is not enough, because that could be
+ * an isolated/hwpoisoned hugepage (which have >0 refcount).
+ */
+ if (!page_huge_active(hpage) && !page_count(hpage)) {
/*
* Hwpoisoned hugepage isn't linked to activelist or freelist,
* but dangling hpage->lru can trigger list-debug warnings
@@ -3810,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
bool isolate_huge_page(struct page *page, struct list_head *list)
{
+ bool ret = true;
+
VM_BUG_ON_PAGE(!PageHead(page), page);
- if (!get_page_unless_zero(page))
- return false;
spin_lock(&hugetlb_lock);
+ if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+ ret = false;
+ goto unlock;
+ }
+ clear_page_huge_active(page);
list_move_tail(&page->lru, list);
+unlock:
spin_unlock(&hugetlb_lock);
- return true;
+ return ret;
}
void putback_active_hugepage(struct page *page)
{
VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
+ set_page_huge_active(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
put_page(page);
}
-
-bool is_hugepage_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- /*
- * This function can be called for a tail page because the caller,
- * scan_movable_pages, scans through a given pfn-range which typically
- * covers one memory block. In systems using gigantic hugepage (1GB
- * for x86_64,) a hugepage is larger than a memory block, and we don't
- * support migrating such large hugepages for now, so return false
- * when called for tail pages.
- */
- if (PageTail(page))
- return false;
- /*
- * Refcount of a hwpoisoned hugepages is 1, but they are not active,
- * so we should return false for them.
- */
- if (unlikely(PageHWPoison(page)))
- return false;
- return page_count(page) > 0;
-}