From 5e27a2df03b8933aa7c1579816ecb6a071bb0e0d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual
Date: Sat, 30 Nov 2019 17:55:06 -0800
Subject: mm/page_alloc: add alloc_contig_pages()

HugeTLB helper alloc_gigantic_page() implements a fairly generic
allocation method, scanning over various zones looking for a large
contiguous pfn range before trying to allocate it with
alloc_contig_range(). Other than deriving the requested order from
'struct hstate', there is nothing HugeTLB specific in there. This can
be made available for general use to allocate contiguous memory that
could not be allocated through the buddy allocator.

alloc_gigantic_page() has been split, carving out the actual
allocation method, which is then made available via the new
alloc_contig_pages() helper wrapped under CONFIG_CONTIG_ALLOC. All
references to 'gigantic' have been replaced with the more generic term
'contig'. Allocated pages here should be freed with
free_contig_range() or by calling __free_page() on each allocated
page.

Link: http://lkml.kernel.org/r/1571300646-32240-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual
Acked-by: David Hildenbrand
Acked-by: Michal Hocko
Cc: Mike Kravetz
Cc: Vlastimil Babka
Cc: Michal Hocko
Cc: David Rientjes
Cc: Andrea Arcangeli
Cc: Oscar Salvador
Cc: Mel Gorman
Cc: Mike Rapoport
Cc: Dan Williams
Cc: Pavel Tatashin
Cc: Matthew Wilcox
Cc: David Hildenbrand
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 77 ++----------------------------------------------------------
 1 file changed, 2 insertions(+), 75 deletions(-)
(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b45a95363a84..26b722faf740 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1069,85 +1069,12 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 }
 
 #ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
-				unsigned long nr_pages, gfp_t gfp_mask)
-{
-	unsigned long end_pfn = start_pfn + nr_pages;
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long i, end_pfn = start_pfn + nr_pages;
-	struct page *page;
-
-	for (i = start_pfn; i < end_pfn; i++) {
-		page = pfn_to_online_page(i);
-		if (!page)
-			return false;
-
-		if (page_zone(page) != z)
-			return false;
-
-		if (PageReserved(page))
-			return false;
-
-		if (page_count(page) > 0)
-			return false;
-
-		if (PageHuge(page))
-			return false;
-	}
-
-	return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
-			unsigned long start_pfn, unsigned long nr_pages)
-{
-	unsigned long last_pfn = start_pfn + nr_pages - 1;
-	return zone_spans_pfn(zone, last_pfn);
-}
-
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 		int nid, nodemask_t *nodemask)
 {
-	unsigned int order = huge_page_order(h);
-	unsigned long nr_pages = 1 << order;
-	unsigned long ret, pfn, flags;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
-
-	zonelist = node_zonelist(nid, gfp_mask);
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
-		spin_lock_irqsave(&zone->lock, flags);
+	unsigned long nr_pages = 1UL << huge_page_order(h);
 
-		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
-		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
-				/*
-				 * We release the zone lock here because
-				 * alloc_contig_range() will also lock the zone
-				 * at some point. If there's an allocation
-				 * spinning on this lock, it may win the race
-				 * and cause alloc_contig_range() to fail...
-				 */
-				spin_unlock_irqrestore(&zone->lock, flags);
-				ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
-				if (!ret)
-					return pfn_to_page(pfn);
-				spin_lock_irqsave(&zone->lock, flags);
-			}
-			pfn += nr_pages;
-		}
-
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-
-	return NULL;
+	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-- cgit v1.2.3

From 552546366a30d88bd1d6f5efe848b2ab50fd57e5 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Sat, 30 Nov 2019 17:56:30 -0800
Subject: hugetlbfs: hugetlb_fault_mutex_hash() cleanup

A new clang diagnostic (-Wsizeof-array-div) warns about the calculation
to determine the number of u32's in an array of unsigned longs.
Suppress the warning by adding parentheses.

While looking at the above issue, we noticed that the 'address'
parameter to hugetlb_fault_mutex_hash() is no longer used. So remove
it from the definition and all callers.

No functional change.

Link: http://lkml.kernel.org/r/20190919011847.18400-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz
Reported-by: Nathan Chancellor
Reviewed-by: Nathan Chancellor
Reviewed-by: Davidlohr Bueso
Reviewed-by: Andrew Morton
Cc: Nick Desaulniers
Cc: Ilie Halip
Cc: David Bolvansky
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c    |  4 ++--
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            | 10 +++++-----
 mm/userfaultfd.c        |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)
(limited to 'mm/hugetlb.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a478df035651..6e5eadee6b0d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			u32 hash;
 
 			index = page->index;
-			hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+			hash = hugetlb_fault_mutex_hash(h, mapping, index);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			/*
@@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		addr = index * hpage_size;
 
 		/* mutex taken here, fault path and hole punch */
-		hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+		hash = hugetlb_fault_mutex_hash(h, mapping, index);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 53fc34f930d0..d3814bd686ba 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -106,7 +106,7 @@ void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-				pgoff_t idx, unsigned long address);
+				pgoff_t idx);
 
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 26b722faf740..39579f98d6f3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3842,7 +3842,7 @@ retry:
 			 * handling userfault. Reacquire after handling
 			 * fault to make calling code simpler.
 			 */
-			hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+			hash = hugetlb_fault_mutex_hash(h, mapping, idx);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3970,7 +3970,7 @@ backout_unlocked:
 
 #ifdef CONFIG_SMP
 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-			     pgoff_t idx, unsigned long address)
+			     pgoff_t idx)
 {
 	unsigned long key[2];
 	u32 hash;
@@ -3978,7 +3978,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
 	key[0] = (unsigned long) mapping;
 	key[1] = idx;
 
-	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+	hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
 
 	return hash & (num_fault_mutexes - 1);
 }
@@ -3988,7 +3988,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
  * return 0 and avoid the hashing overhead.
  */
 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-			     pgoff_t idx, unsigned long address)
+			     pgoff_t idx)
 {
 	return 0;
 }
@@ -4032,7 +4032,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+	hash = hugetlb_fault_mutex_hash(h, mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 	entry = huge_ptep_get(ptep);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index c7ae74ce5ff3..640ff2bd9a69 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -269,7 +269,7 @@ retry:
 		 */
 		idx = linear_page_index(dst_vma, dst_addr);
 		mapping = dst_vma->vm_file->f_mapping;
-		hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
+		hash = hugetlb_fault_mutex_hash(h, mapping, idx);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		err = -ENOMEM;
-- cgit v1.2.3

From 930668c34408ba983049322e04f13f03b6f1fafa Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Sat, 30 Nov 2019 17:56:49 -0800
Subject: hugetlbfs: take read_lock on i_mmap for PMD sharing

A customer with large SMP systems (up to 16 sockets) running an
application that uses a large amount of static hugepages (~500-1500GB)
is experiencing random multisecond delays. These delays were caused by
the long time it took to scan the VMA interval tree with mmap_sem held.

Sharing a huge PMD does not require changes to the i_mmap at all.
Therefore, we can just take the read lock and let other threads
searching for the right VMA share it in parallel. Once the right VMA is
found, either the PMD lock (2M huge page for x86-64) or the
mm->page_table_lock will be acquired to perform the actual PMD sharing.

Lock contention, if present, will happen in the spinlock. That is much
better than contention in the rwsem, where the time needed to scan the
interval tree is indeterminate. With this patch applied, the customer
is seeing significant performance improvement over the unpatched
kernel.
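[editor's note: a minimal sketch of the locking pattern this patch
relies on -- the interval-tree scan runs under the rwsem read lock so
multiple faulting threads can search concurrently, while the actual PMD
population stays serialized by a spinlock. find_matching_vma() is a
hypothetical stand-in for the vma_interval_tree_foreach() scan in
huge_pmd_share(), not code from the patch:

	i_mmap_lock_read(mapping);		/* many CPUs may scan in parallel */
	svma = find_matching_vma(mapping, idx);	/* hypothetical helper */
	if (svma) {
		spin_lock(ptl);			/* sharing itself is serialized here */
		/* ... install the shared PMD if still unpopulated ... */
		spin_unlock(ptl);
	}
	i_mmap_unlock_read(mapping);
]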
Link: http://lkml.kernel.org/r/20191107211809.9539-1-longman@redhat.com
Signed-off-by: Waiman Long
Suggested-by: Mike Kravetz
Reviewed-by: Mike Kravetz
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Cc: Ingo Molnar
Cc: Will Deacon
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39579f98d6f3..18c92cb9bf43 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4769,7 +4769,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -4799,7 +4799,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
-	i_mmap_unlock_write(mapping);
+	i_mmap_unlock_read(mapping);
 	return pte;
 }
-- cgit v1.2.3

From 5c9119542035dbbc61241ab8dc7feeac11fa82ca Mon Sep 17 00:00:00 2001
From: Mina Almasry
Date: Sat, 30 Nov 2019 17:56:54 -0800
Subject: hugetlb: region_chg provides only cache entry

Current behavior is that region_chg provides both a cache entry in
resv->region_cache, AND a placeholder entry in resv->regions.
region_add first tries to use the placeholder, and if it finds that the
placeholder has been deleted by a racing region_del call, it uses the
cache entry.

This behavior is completely unnecessary and is removed in this patch
for a couple of reasons:

1. region_add needs to either find a cached file_region entry in
   resv->region_cache, or find an entry in resv->regions to expand. It
   does not need both.
2. region_chg adding a placeholder entry in resv->regions opens up a
   possible race with region_del, where region_chg adds a placeholder
   region in resv->regions, and this region is deleted by a racing call
   to region_del during region_chg execution or before region_add is
   called. Removing the race makes the code easier to reason about and
   maintain.

In addition, a follow-up patch in another series disables region
coalescing, which would be further complicated if the race with
region_del existed.

Link: http://lkml.kernel.org/r/20190919200428.188797-2-almasrymina@google.com
Signed-off-by: Mina Almasry
Reviewed-by: Mike Kravetz
Cc: David Rientjes
Cc: Shakeel Butt
Cc: Greg Thelen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 63 +++++++++++------------------------------------------------
 1 file changed, 11 insertions(+), 52 deletions(-)
(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 18c92cb9bf43..17178dbd1167 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -246,14 +246,10 @@ struct file_region {
 
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
+ * map.
Existing regions will be expanded to accommodate the specified + * range, or a region will be taken from the cache. Sufficient regions + * must exist in the cache due to the previous call to region_chg with + * the same range. * * Return the number of new huge pages added to the map. This * number is greater than or equal to zero. @@ -272,9 +268,8 @@ static long region_add(struct resv_map *resv, long f, long t) /* * If no region exists which can be expanded to include the - * specified range, the list must have been modified by an - * interleving call to region_del(). Pull a region descriptor - * from the cache and use it for this range. + * specified range, pull a region descriptor from the cache + * and use it for this range. */ if (&rg->link == head || t < rg->from) { VM_BUG_ON(resv->region_cache_count <= 0); @@ -339,15 +334,9 @@ out_locked: * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the - * map. However, if the existing regions in the map can not - * be expanded to represent the new range, a new file_region - * structure is added to the map as a placeholder. This is - * so that the subsequent region_add call will have all the - * regions it needs and will not fail. - * - * Upon entry, region_chg will also examine the cache of region descriptors - * associated with the map. If there are not enough descriptors cached, one - * will be allocated for the in progress add operation. + * map. A new file_region structure is added to the cache + * as a placeholder, so that the subsequent region_add + * call will have all the regions it needs and will not fail. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to @@ -357,10 +346,9 @@ out_locked: static long region_chg(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg = NULL; + struct file_region *rg; long chg = 0; -retry: spin_lock(&resv->lock); retry_locked: resv->adds_in_progress++; @@ -378,10 +366,8 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) { - kfree(nrg); + if (!trg) return -ENOMEM; - } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -394,28 +380,6 @@ retry_locked: if (f <= rg->to) break; - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - if (!nrg) { - resv->adds_in_progress--; - spin_unlock(&resv->lock); - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - goto retry; - } - - list_add(&nrg->link, rg->link.prev); - chg = t - f; - goto out_nrg; - } - /* Round our left edge to the current segment if it encloses us. 
*/ if (f > rg->from) f = rg->from; @@ -439,11 +403,6 @@ retry_locked: } out: - spin_unlock(&resv->lock); - /* We already know we raced and no longer need the new region */ - kfree(nrg); - return chg; -out_nrg: spin_unlock(&resv->lock); return chg; } -- cgit v1.2.3 From d75c6af9c89ac1fe8b74a5c094ce412ae992efc9 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Sat, 30 Nov 2019 17:56:59 -0800 Subject: hugetlb: remove duplicated code Remove duplicated code between region_chg and region_add, and refactor it into a common function, add_reservation_in_range. This is mostly done because there is a follow up change in another series that disables region coalescing in region_add, and I want to make that change in one place only. It should improve maintainability anyway on its own. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190919200428.188797-3-almasrymina@google.com Signed-off-by: Mina Almasry Reviewed-by: Mike Kravetz Cc: David Rientjes Cc: Shakeel Butt Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 119 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 57 insertions(+), 62 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 17178dbd1167..a8e43aa9c670 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -244,6 +244,60 @@ struct file_region { long to; }; +/* Must be called with resv->lock held. Calling this with count_only == true + * will count the number of pages to be added but will not modify the linked + * list. + */ +static long add_reservation_in_range(struct resv_map *resv, long f, long t, + bool count_only) +{ + long chg = 0; + struct list_head *head = &resv->regions; + struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* We overlap with this area, if it extends further than + * us then we must extend ourselves. Account for its + * existing reservation. + */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + + if (!count_only && rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + + if (!count_only) { + nrg->from = f; + nrg->to = t; + } + + return chg; +} + /* * Add the huge page range represented by [f, t) to the reserve * map. Existing regions will be expanded to accommodate the specified @@ -257,7 +311,7 @@ struct file_region { static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg, *trg; + struct file_region *rg, *nrg; long add = 0; spin_lock(&resv->lock); @@ -287,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t) goto out_locked; } - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. 
If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - /* Decrement return value by the deleted range. - * Another range will span this area so that by - * end of routine add will be >= zero - */ - add -= (rg->to - rg->from); - list_del(&rg->link); - kfree(rg); - } - } - - add += (nrg->from - f); /* Added to beginning of region */ - nrg->from = f; - add += t - nrg->to; /* Added to end of region */ - nrg->to = t; + add = add_reservation_in_range(resv, f, t, false); out_locked: resv->adds_in_progress--; @@ -345,8 +368,6 @@ out_locked: */ static long region_chg(struct resv_map *resv, long f, long t) { - struct list_head *head = &resv->regions; - struct file_region *rg; long chg = 0; spin_lock(&resv->lock); @@ -375,34 +396,8 @@ retry_locked: goto retry_locked; } - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. */ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - goto out; + chg = add_reservation_in_range(resv, f, t, true); - /* We overlap with this area, if it extends further than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - -out: spin_unlock(&resv->lock); return chg; } -- cgit v1.2.3 From 188b04a7d93860fd100b2671600b8ad81fb0a842 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:02 -0800 Subject: hugetlb: remove unused hstate in hugetlb_fault_mutex_hash() The first parameter hstate in function hugetlb_fault_mutex_hash() is not used anymore. This patch removes it. 
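[editor's note: taken together with the two preceding patches, the call
pattern at every call site now reduces to the following, as the hunks
below show:

	hash = hugetlb_fault_mutex_hash(mapping, idx);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... fault, fallocate, or hole-punch work on (mapping, idx) ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
]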
[akpm@linux-foundation.org: various build fixes] [cai@lca.pw: fix a GCC compilation warning] Link: http://lkml.kernel.org/r/1570544108-32331-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20191005003302.785-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Signed-off-by: Qian Cai Suggested-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Mike Kravetz Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- include/linux/hugetlb.h | 3 +-- mm/hugetlb.c | 10 ++++------ mm/userfaultfd.c | 5 +---- 4 files changed, 8 insertions(+), 14 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c978061c3893..d5c2a3158610 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, mapping, index); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mapping, index); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 159d2012cdb1..31d4920994b9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx); +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8e43aa9c670..8624b7758abb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3796,7 +3796,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3923,8 +3923,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; @@ -3941,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } @@ -3986,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. 
 	 */
-	hash = hugetlb_fault_mutex_hash(h, mapping, idx);
+	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 	entry = huge_ptep_get(ptep);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 640ff2bd9a69..6d152741bb26 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -184,7 +184,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 	unsigned long src_addr, dst_addr;
 	long copied;
 	struct page *page;
-	struct hstate *h;
 	unsigned long vma_hpagesize;
 	pgoff_t idx;
 	u32 hash;
@@ -256,8 +255,6 @@ retry:
 		goto out_unlock;
 	}
 
-	h = hstate_vma(dst_vma);
-
 	while (src_addr < src_start + len) {
 		pte_t dst_pteval;
 
@@ -269,7 +266,7 @@ retry:
 		 */
 		idx = linear_page_index(dst_vma, dst_addr);
 		mapping = dst_vma->vm_file->f_mapping;
-		hash = hugetlb_fault_mutex_hash(h, mapping, idx);
+		hash = hugetlb_fault_mutex_hash(mapping, idx);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		err = -ENOMEM;
-- cgit v1.2.3

From acbfb087e3b19959d6f4b779a9a15bff644b8c9a Mon Sep 17 00:00:00 2001
From: Zhigang Lu
Date: Sat, 30 Nov 2019 17:57:06 -0800
Subject: mm/hugetlb: avoid looping to the same hugepage if !pages and !vmas

When mmapping an existing hugetlbfs file with MAP_POPULATE, we find it
is very time-consuming. For example, mmapping a 128GB file takes about
50 milliseconds. Sampling with perf events shows it spends 99% of its
time in the same_page loop in follow_hugetlb_page().

  samples: 205 of event 'cycles', Event count (approx.): 136686374
  - 99.04% test_mmap_huget [kernel.kallsyms] [k] follow_hugetlb_page
       follow_hugetlb_page
       __get_user_pages
       __mlock_vma_pages_range
       __mm_populate
       vm_mmap_pgoff
       sys_mmap_pgoff
       sys_mmap
       system_call_fastpath
       __mmap64

follow_hugetlb_page() is called with pages=NULL and vmas=NULL, so for
each hugepage we run through the same_page loop pages_per_huge_page()
times, doing nothing. With this change, it takes less than 1
millisecond to mmap a 128GB file in hugetlbfs.

Link: http://lkml.kernel.org/r/1567581712-5992-1-git-send-email-totty.lu@gmail.com
Signed-off-by: Zhigang Lu
Reviewed-by: Haozhong Zhang
Reviewed-by: Zongming Zhang
Reviewed-by: Mike Kravetz
Acked-by: Matthew Wilcox
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8624b7758abb..ac65bb5e38ac 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4338,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				break;
 			}
 		}
+
+		/*
+		 * If subpage information not requested, update counters
+		 * and skip the same_page loop below.
+		 */
+		if (!pages && !vmas && !pfn_offset &&
+		    (vaddr + huge_page_size(h) < vma->vm_end) &&
+		    (remainder >= pages_per_huge_page(h))) {
+			vaddr += huge_page_size(h);
+			remainder -= pages_per_huge_page(h);
+			i += pages_per_huge_page(h);
+			spin_unlock(ptl);
+			continue;
+		}
+
 same_page:
 		if (pages) {
 			pages[i] = mem_map_offset(page, pfn_offset);
-- cgit v1.2.3

From c77c0a8ac4c522638a8242fcb9de9496e3cdbb2d Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Sat, 4 Jan 2020 13:00:15 -0800
Subject: mm/hugetlb: defer freeing of huge pages if in non-task context

The following lockdep splat was observed when a certain hugetlbfs test
was run:

  ================================
  WARNING: inconsistent lock state
  4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
  --------------------------------
  inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
  swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
  ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
  {SOFTIRQ-ON-W} state was registered at:
    lock_acquire+0x14f/0x3b0
    _raw_spin_lock+0x30/0x70
    __nr_hugepages_store_common+0x11b/0xb30
    hugetlb_sysctl_handler_common+0x209/0x2d0
    proc_sys_call_handler+0x37f/0x450
    vfs_write+0x157/0x460
    ksys_write+0xb8/0x170
    do_syscall_64+0xa5/0x4d0
    entry_SYSCALL_64_after_hwframe+0x6a/0xdf
  irq event stamp: 691296
  hardirqs last enabled at (691296): [] _raw_spin_unlock_irqrestore+0x4b/0x60
  hardirqs last disabled at (691295): [] _raw_spin_lock_irqsave+0x22/0x81
  softirqs last enabled at (691284): [] irq_enter+0xc3/0xe0
  softirqs last disabled at (691285): [] irq_exit+0x23e/0x2b0

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(hugetlb_lock);
    lock(hugetlb_lock);

   *** DEADLOCK ***

  Call Trace:
    __lock_acquire+0x146b/0x48c0
    lock_acquire+0x14f/0x3b0
    _raw_spin_lock+0x30/0x70
    free_huge_page+0x36f/0xaa0
    bio_check_pages_dirty+0x2fc/0x5c0
    clone_endio+0x17f/0x670 [dm_mod]
    blk_update_request+0x276/0xe50
    scsi_end_request+0x7b/0x6a0
    scsi_io_completion+0x1c6/0x1570
    blk_done_softirq+0x22e/0x350
    __do_softirq+0x23d/0xad8
    irq_exit+0x23e/0x2b0
    do_IRQ+0x11a/0x200
    common_interrupt+0xf/0xf

Both the hugetlb_lock and the subpool lock can be acquired in
free_huge_page(). One way to solve the problem is to make both locks
irq-safe. However, Mike Kravetz had learned that the hugetlb_lock is
held for a linear scan of ALL hugetlb pages during a cgroup reparenting
operation. So it is just too long to have irqs disabled unless we can
break hugetlb_lock down into finer-grained locks with shorter lock hold
times.

Another alternative is to defer the freeing to a workqueue job. This
patch implements the deferred freeing by adding a free_hpage_workfn()
work function to do the actual freeing. The free_huge_page() call in a
non-task context saves the page to be freed in the hpage_freelist
linked list in a lockless manner, using the llist APIs. The generic
workqueue is used to process the work, but a dedicated workqueue can be
used instead if it is desirable to have the huge page freed ASAP.

Thanks to Kirill Tkhai for suggesting the use of the llist APIs, which
simplify the code.

Link: http://lkml.kernel.org/r/20191217170331.30893-1-longman@redhat.com
Signed-off-by: Waiman Long
Reviewed-by: Mike Kravetz
Acked-by: Davidlohr Bueso
Acked-by: Michal Hocko
Reviewed-by: Kirill Tkhai
Cc: Aneesh Kumar K.V
Cc: Matthew Wilcox
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)
(limited to 'mm/hugetlb.c')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..dd8737a94bec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include <linux/llist.h>
 
 #include 
 #include 
@@ -1136,7 +1137,7 @@ static inline void ClearPageHugeTemporary(struct page *page)
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
@@ -1199,6 +1200,54 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
As the page->mapping pointer is + * going to be cleared in __free_huge_page() anyway, it is reused as the + * llist_node structure of a lockless linked list of huge pages to be freed. + */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + struct page *page; + + node = llist_del_all(&hpage_freelist); + + while (node) { + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + __free_huge_page(page); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +void free_huge_page(struct page *page) +{ + /* + * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. + */ + if (!in_task()) { + /* + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the + * workfn hasn't retrieved the list yet. + */ + if (llist_add((struct llist_node *)&page->mapping, + &hpage_freelist)) + schedule_work(&free_hpage_work); + return; + } + + __free_huge_page(page); +} + static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); -- cgit v1.2.3