diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 81 |
1 files changed, 71 insertions, 10 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 39f92aad7bd1..6a3cf7935c14 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1250,7 +1250,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) /* * Reset and decrement one ref on hugepage private reservation. * Called with mm->mmap_lock writer semaphore held. - * This function should be only used by move_vma() and operate on + * This function should be only used by mremap and operate on * same sized vma. It should never come here with last ref on the * reservation. */ @@ -2271,7 +2271,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. */ - __prep_account_new_huge_page(h, nid); + __prep_account_new_huge_page(h, folio_nid(folio)); /* * We could have raced with the pool size change. @@ -2949,12 +2949,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); + + /* + * The folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + */ + spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { + spin_unlock_irq(&hugetlb_lock); start_pfn++; continue; } + spin_unlock_irq(&hugetlb_lock); if (!folio_ref_count(folio)) { ret = alloc_and_dissolve_hugetlb_folio(h, folio, @@ -3010,7 +3018,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long retval, gbl_chg; + long retval, gbl_chg, gbl_reserve; map_chg_state map_chg; int ret, idx; struct hugetlb_cgroup *h_cg = NULL; @@ -3163,8 +3171,16 @@ out_uncharge_cgroup_reservation: hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg) - hugepage_subpool_put_pages(spool, 1); + /* + * put page to subpool iff the quota of subpool's rsv_hpages is used + * during hugepage_subpool_get_pages. + */ + if (map_chg && !gbl_chg) { + gbl_reserve = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -gbl_reserve); + } + + out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); @@ -3825,6 +3841,7 @@ found: static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { + unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; @@ -3959,8 +3976,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. + * + * min_count is the expected number of persistent pages, we + * shouldn't calculate min_count by using + * resv_huge_pages + persistent_huge_pages() - free_huge_pages, + * because there may exist free surplus huge pages, and this will + * lead to subtracting twice. Free surplus huge pages come from HVO + * failing to restore vmemmap, see comments in the callers of + * hugetlb_vmemmap_restore_folio(). Thus, we should calculate + * persistent free count first. */ - min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + persistent_free_count = h->free_huge_pages; + if (h->free_huge_pages > persistent_huge_pages(h)) { + if (h->free_huge_pages > h->surplus_huge_pages) + persistent_free_count -= h->surplus_huge_pages; + else + persistent_free_count = 0; + } + min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); @@ -4017,10 +4050,13 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, list_for_each_entry_safe(folio, next, src_list, lru) { int i; + bool cma; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; + cma = folio_test_hugetlb_cma(folio); + list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); @@ -4036,6 +4072,9 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, new_folio->mapping = NULL; init_new_hugetlb_folio(dst, new_folio); + /* Copy the CMA flag so that it is freed correctly */ + if (cma) + folio_set_hugetlb_cma(new_folio); list_add(&new_folio->lru, &dst_list); } } @@ -4630,7 +4669,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s\n", h->name); } #ifdef CONFIG_NUMA @@ -7216,7 +7255,7 @@ bool hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long chg = -1, add = -1; + long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -7351,8 +7390,16 @@ bool hugetlb_reserve_pages(struct inode *inode, return true; out_put_pages: - /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + spool_resv = chg - gbl_reserve; + if (spool_resv) { + /* put sub pool's reservation back, chg - gbl_reserve */ + gbl_resv = hugepage_subpool_put_pages(spool, spool_resv); + /* + * subpool's reserved pages can not be put back due to race, + * return to hstate. + */ + hugetlb_acct_memory(h, -gbl_resv); + } out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); @@ -7892,3 +7939,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); } + +/* + * For hugetlb, mremap() is an odd edge case - while the VMA copying is + * performed, we permit both the old and new VMAs to reference the same + * reservation. + * + * We fix this up after the operation succeeds, or if a newly allocated VMA + * is closed as a result of a failure to allocate memory. + */ +void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ + if (is_vm_hugetlb_page(vma)) + clear_vma_resv_huge_pages(vma); +} |