diff options
Diffstat (limited to 'mm')
55 files changed, 3434 insertions, 1755 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 48b1af447fa7..0ded10a22639 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -678,6 +678,7 @@ config ZONE_DEVICE depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE + select RADIX_TREE_MULTIORDER help Device memory hotplug support allows for establishing pmem, diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 9075aa54e955..b06d9fe23a28 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) { unsigned long flags; struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO); + __GFP_NOMEMALLOC | __GFP_NORETRY); if (!page) return NULL; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 595b757bef72..c03ccbc405a0 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%s", cma->name); + scnprintf(name, sizeof(name), "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); diff --git a/mm/debug.c b/mm/debug.c index db1cd26d8752..5715448ab0b5 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -124,9 +124,7 @@ void dump_mm(const struct mm_struct *mm) #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" #endif -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" -#endif "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, @@ -158,9 +156,7 @@ void dump_mm(const struct mm_struct *mm) #ifdef CONFIG_NUMA_BALANCING mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, #endif -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) - mm->tlb_flush_pending, -#endif + atomic_read(&mm->tlb_flush_pending), mm->def_flags, &mm->def_flags ); } diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 
6d5717bd7197..b1dd4a948fc0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup); static int after_paging_init __initdata; +pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + return prot; +} + void __init __weak early_ioremap_shutdown(void) { } @@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size) void __init * early_memremap(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, - FIXMAP_PAGE_NORMAL); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_NORMAL); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #ifdef FIXMAP_PAGE_RO void __init * early_memremap_ro(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_RO); + + return (__force void *)__early_ioremap(phys_addr, size, prot); +} +#endif + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); } #endif diff --git a/mm/filemap.c b/mm/filemap.c index a49702445ce0..9d21afd692b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping, return -EEXIST; mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, p, - true); - } + if (shadowp) + *shadowp = p; } 
__radix_tree_replace(&mapping->page_tree, node, slot, page, workingset_update_node, mapping); @@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - bool ret; + struct page *page; if (end_byte < start_byte) return false; @@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping, if (mapping->nrpages == 0) return false; - pagevec_init(&pvec, 0); - if (!pagevec_lookup(&pvec, mapping, index, 1)) + if (!find_get_pages_range(mapping, &index, end, 1, &page)) return false; - ret = (pvec.pages[0]->index <= end); - pagevec_release(&pvec); - return ret; + put_page(page); + return true; } EXPORT_SYMBOL(filemap_range_has_page); @@ -476,6 +464,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, EXPORT_SYMBOL(filemap_fdatawait_range); /** + * file_fdatawait_range - wait for writeback to complete + * @file: file pointing to address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the address space that file + * refers to, in the given range and wait for all of them. Check error + * status of the address space vs. the file->f_wb_err cursor and return it. + * + * Since the error status of the file is advanced by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. 
+ */ +int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) +{ + struct address_space *mapping = file->f_mapping; + + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return file_check_and_advance_wb_err(file); +} +EXPORT_SYMBOL(file_fdatawait_range); + +/** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * @@ -489,45 +500,22 @@ EXPORT_SYMBOL(filemap_fdatawait_range); */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - __filemap_fdatawait_range(mapping, 0, i_size - 1); + __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); -/** - * filemap_fdatawait - wait for all under-writeback pages to complete - * @mapping: address space structure to wait for - * - * Walk the list of under-writeback pages of the given address space - * and wait for all of them. Check error status of the address space - * and return it. - * - * Since the error status of the address space is cleared by this function, - * callers are responsible for checking the return value and handling and/or - * reporting the error. 
- */ -int filemap_fdatawait(struct address_space *mapping) +static bool mapping_needs_writeback(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - return filemap_fdatawait_range(mapping, 0, i_size - 1); + return (!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional); } -EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -566,8 +554,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -589,7 +576,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { - errseq_t eseq = __errseq_set(&mapping->wb_err, err); + errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } @@ -656,8 +643,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -885,6 +871,7 @@ void __init pagecache_init(void) page_writeback_init(); } +/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ struct wait_page_key { struct page *page; 
int bit_nr; @@ -909,8 +896,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, if (wait_page->bit_nr != key->bit_nr) return 0; + + /* Stop walking if it's locked */ if (test_bit(key->bit_nr, &key->page->flags)) - return 0; + return -1; return autoremove_wake_function(wait, mode, sync, key); } @@ -964,6 +953,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, int ret = 0; init_wait(wait); + wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; @@ -972,10 +962,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, spin_lock_irq(&q->lock); if (likely(list_empty(&wait->entry))) { - if (lock) - __add_wait_queue_entry_tail_exclusive(q, wait); - else - __add_wait_queue(q, wait); + __add_wait_queue_entry_tail(q, wait); SetPageWaiters(page); } @@ -985,10 +972,6 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, if (likely(test_bit(bit_nr, &page->flags))) { io_schedule(); - if (unlikely(signal_pending_state(state, current))) { - ret = -EINTR; - break; - } } if (lock) { @@ -998,6 +981,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, if (!test_bit(bit_nr, &page->flags)) break; } + + if (unlikely(signal_pending_state(state, current))) { + ret = -EINTR; + break; + } } finish_wait(q, wait); @@ -1039,7 +1027,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, waiter); + __add_wait_queue_entry_tail(q, waiter); SetPageWaiters(page); spin_unlock_irqrestore(&q->lock, flags); } @@ -1564,23 +1552,29 @@ export: } /** - * find_get_pages - gang pagecache lookup + * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index (inclusive) * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - 
* find_get_pages() will search for and return a group of up to - * @nr_pages pages in the mapping. The pages are placed at @pages. - * find_get_pages() takes a reference against the returned pages. + * find_get_pages_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting at index @start and up to index @end + * (inclusive). The pages are placed at @pages. find_get_pages_range() takes + * a reference against the returned pages. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. + * We also update @start to index the next page for the traversal. * - * find_get_pages() returns the number of pages which were found. + * find_get_pages_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1627,11 +1624,25 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *start = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. 
This + * breaks the iteration when there is page at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: rcu_read_unlock(); + return ret; } @@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ -#ifdef __HAVE_ARCH_PTE_DEVMAP +#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86975dec0ba1..0b51e70e0a8b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> +#include <linux/oom.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -327,7 +328,7 @@ static struct attribute *hugepage_attr[] = { NULL, }; -static struct attribute_group hugepage_attr_group = { +static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; @@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -561,12 +563,11 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto release; } - clear_huge_page(page, haddr, HPAGE_PMD_NR); + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page 
*page, vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - pte_free(vma->vm_mm, pgtable); + goto unlock_release; } else { pmd_t entry; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { int ret; @@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + mem_cgroup_cancel_charge(page, memcg, true); + put_page(page); + return ret; + } /* @@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = 0; set = false; if (pmd_none(*vmf->pmd)) { - if (userfaultfd_missing(vma)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); @@ -1226,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * We can only reuse the page if nobody else maps the huge page or it's * part. 
*/ - if (page_trans_huge_mapcount(page, NULL) == 1) { + if (!trylock_page(page)) { + get_page(page); + spin_unlock(vmf->ptl); + lock_page(page); + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + put_page(page); + } + if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; + unlock_page(page); goto out_unlock; } + unlock_page(page); get_page(page); spin_unlock(vmf->ptl); alloc: @@ -1277,7 +1305,7 @@ alloc: count_vm_event(THP_FAULT_ALLOC); if (!page) - clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); @@ -1496,10 +1524,25 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) } /* + * Since we took the NUMA fault, we must have observed the !accessible + * bit. Make sure all other CPUs agree with that, to avoid them + * modifying the page we're about to migrate. + * + * Must be done under PTL such that we'll observe the relevant + * inc_tlb_flush_pending(). + * + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ + if (mm_tlb_flush_pending(vma->vm_mm)) + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + + /* * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. 
*/ spin_unlock(vmf->ptl); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { @@ -2438,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); + if (PageWriteback(page)) + return -EBUSY; + if (PageAnon(head)) { /* * The caller does not necessarily hold an mmap_sem that would @@ -2515,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); __split_huge_page(page, list, flags); - ret = 0; + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + ret = split_swap_cluster(entry); + } else + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc48ee783dd9..34625b257128 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order) } static int __alloc_gigantic_page(unsigned long start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, gfp_t gfp_mask) { unsigned long end_pfn = start_pfn + nr_pages; return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - GFP_KERNEL); + gfp_mask); } static bool pfn_range_valid_gigantic(struct zone *z, @@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned int order) +static struct page *alloc_gigantic_page(int nid, struct hstate *h) { + unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; - struct zone *z; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + gfp_t gfp_mask; - z = NODE_DATA(nid)->node_zones; - for (; z - 
NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { - spin_lock_irqsave(&z->lock, flags); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + zonelist = node_zonelist(nid, gfp_mask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + spin_lock_irqsave(&zone->lock, flags); - pfn = ALIGN(z->zone_start_pfn, nr_pages); - while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { + pfn = ALIGN(zone->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone @@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) * spinning on this lock, it may win the race * and cause alloc_contig_range() to fail... */ - spin_unlock_irqrestore(&z->lock, flags); - ret = __alloc_gigantic_page(pfn, nr_pages); + spin_unlock_irqrestore(&zone->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); if (!ret) return pfn_to_page(pfn); - spin_lock_irqsave(&z->lock, flags); + spin_lock_irqsave(&zone->lock, flags); } pfn += nr_pages; } - spin_unlock_irqrestore(&z->lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } return NULL; @@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { struct page *page; - page = alloc_gigantic_page(nid, huge_page_order(h)); + page = alloc_gigantic_page(nid, h); if (page) { prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, nid); @@ -2569,13 +2574,13 @@ static struct attribute *hstate_attrs[] = { NULL, }; -static struct attribute_group hstate_attr_group = { +static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) + 
const struct attribute_group *hstate_attr_group) { int retval; int hi = hstate_index(h); @@ -2633,7 +2638,7 @@ static struct attribute *per_node_hstate_attrs[] = { NULL, }; -static struct attribute_group per_node_hstate_attr_group = { +static const struct attribute_group per_node_hstate_attr_group = { .attrs = per_node_hstate_attrs, }; @@ -4062,9 +4067,9 @@ out: return ret; out_release_unlock: spin_unlock(ptl); -out_release_nounlock: if (vm_shared) unlock_page(page); +out_release_nounlock: put_page(page); goto out; } @@ -4078,6 +4083,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); + int err = -EFAULT; while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -4154,11 +4160,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } ret = hugetlb_fault(mm, vma, vaddr, fault_flags); if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, flags); - - if (err) - return err; - + err = vm_fault_to_errno(ret, flags); remainder = 0; break; } @@ -4213,7 +4215,7 @@ same_page: */ *position = vaddr; - return i ? i : -EFAULT; + return i ? i : err; } #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE @@ -4603,6 +4605,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } +/* + * huge_pte_offset() - Walk the page table to resolve the hugepage + * entry at address @addr + * + * Return: Pointer to page table or swap entry (PUD or PMD) for + * address @addr, or NULL if a p*d_none() entry is encountered and the + * size @sz doesn't match the hugepage size at this level of the page + * table. 
+ */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -4617,13 +4628,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; + pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) + if (sz != PUD_SIZE && pud_none(*pud)) return NULL; - if (pud_huge(*pud)) + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) return (pte_t *)pud; + pmd = pmd_offset(pud, addr); - return (pte_t *) pmd; + if (sz != PMD_SIZE && pmd_none(*pmd)) + return NULL; + /* hugepage or swap? */ + if (pmd_huge(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + return NULL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ diff --git a/mm/internal.h b/mm/internal.h index 24d88f084705..1df011f62480 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) +/* + * Only MMU archs have async oom victim reclaim - aka oom_reaper so we + * cannot assume a reduced access to memory reserves is sufficient for + * !MMU + */ +#ifdef CONFIG_MMU +#define ALLOC_OOM 0x08 +#else +#define ALLOC_OOM ALLOC_NO_WATERMARKS +#endif + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -498,6 +509,7 @@ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); +void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { @@ -505,7 +517,9 @@ static inline void try_to_unmap_flush(void) static inline void try_to_unmap_flush_dirty(void) { } - +static inline void flush_tlb_batched_pending(struct mm_struct *mm) +{ +} #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags 
pageflag_names[]; @@ -522,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; } +void setup_zone_pageset(struct zone *zone); #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index ca11bc4ce205..6f319fb81718 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } -void kasan_check_read(const void *p, unsigned int size) +void kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(kasan_check_read); -void kasan_check_write(const void *p, unsigned int size) +void kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 04bb1d3eb9ec..6bcfb01ba038 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size, disable_trace_on_warning(); info.access_addr = (void *)addr; + info.first_bad_addr = (void *)addr; info.access_size = size; info.is_write = is_write; info.ip = ip; @@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) { + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || + mm_tlb_flush_pending(mm)) { pte_t entry; swapped = PageSwapCache(page); @@ -3042,7 +3043,7 @@ static struct attribute *ksm_attrs[] = { NULL, }; -static struct attribute_group ksm_attr_group = { +static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; diff --git a/mm/madvise.c b/mm/madvise.c index 9976852f1e1c..eea1c733286f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -80,6 
+80,17 @@ static long madvise_behavior(struct vm_area_struct *vma, } new_flags &= ~VM_DONTCOPY; break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) { + error = -EINVAL; + goto out; + } + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; @@ -320,6 +331,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; @@ -367,8 +379,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, pte_offset_map_lock(mm, pmd, addr, &ptl); goto out; } - put_page(page); unlock_page(page); + put_page(page); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; @@ -612,6 +624,7 @@ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { struct page *page; + struct zone *zone; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -645,6 +658,11 @@ static int madvise_inject_error(int behavior, if (ret) return ret; } + + /* Ensure that all poisoned pages are removed from per-cpu lists */ + for_each_populated_zone(zone) + drain_all_pages(zone); + return 0; } #endif @@ -689,6 +707,8 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: + case MADV_WIPEONFORK: + case MADV_KEEPONFORK: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: diff --git a/mm/memblock.c b/mm/memblock.c index 2cb25fe4452c..91205780e6b1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - -phys_addr_t __init_memblock 
get_allocated_memblock_reserved_regions_info( - phys_addr_t *addr) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - *addr = __pa(memblock.reserved.regions); - - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.reserved.max); -} - -phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( - phys_addr_t *addr) +/** + * Discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) { - if (memblock.memory.regions == memblock_memory_init_regions) - return 0; + phys_addr_t addr, size; - *addr = __pa(memblock.memory.regions); + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + __memblock_free_late(addr, size); + } - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.memory.max); + if (memblock.memory.regions != memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + __memblock_free_late(addr, size); + } } - #endif /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..ad15850ee157 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * value, and reading all cpu value can be performance bottleneck in some * common workload, threshold and synchronization as vmstat[] should be * implemented. + * + * The parameter idx can be of type enum memcg_event_item or vm_event_item. */ static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - enum memcg_event_item event) + int event) { unsigned long val = 0; int cpu; @@ -1611,9 +1613,13 @@ cleanup: * @page: the page * * This function protects unlocked LRU pages from being moved to - * another cgroup and stabilizes their page->mem_cgroup binding. + * another cgroup. 
+ * + * It ensures lifetime of the returned memcg. Caller is responsible + * for the lifetime of the page; __unlock_page_memcg() is available + * when @page might get freed inside the locked section. */ -void lock_page_memcg(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1622,18 +1628,24 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page itself from being freed. E.g. writeback + * doesn't hold a page reference and relies on PG_writeback to + * keep off truncation, migration and so forth. + */ rcu_read_lock(); if (mem_cgroup_disabled()) - return; + return NULL; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return; + return NULL; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1649,18 +1661,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return; + return memcg; } EXPORT_SYMBOL(lock_page_memcg); /** - * unlock_page_memcg - unlock a page->mem_cgroup binding - * @page: the page + * __unlock_page_memcg - unlock and unpin a memcg + * @memcg: the memcg + * + * Unlock and unpin a memcg returned by lock_page_memcg(). 
*/ -void unlock_page_memcg(struct page *page) +void __unlock_page_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = page->mem_cgroup; - if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1672,6 +1684,15 @@ void unlock_page_memcg(struct page *page) rcu_read_unlock(); } + +/** + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page + */ +void unlock_page_memcg(struct page *page) +{ + __unlock_page_memcg(page->mem_cgroup); +} EXPORT_SYMBOL(unlock_page_memcg); /* @@ -1896,7 +1917,7 @@ retry: * bypass the last charges so that they can exit quickly and * free their memory. */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || + if (unlikely(tsk_is_oom_victim(current) || fatal_signal_pending(current) || current->flags & PF_EXITING)) goto force; @@ -4300,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + memcg->low = 0; + memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4616,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (!ret || !target) put_page(page); } - /* There is a swap entry and a page doesn't exist or isn't charged */ - if (ent.val && !ret && + /* + * There is a swap entry and a page doesn't exist or isn't charged. + * But we cannot move a tail-page in a THP. + */ + if (ent.val && !ret && (!page || !PageTransCompound(page)) && mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) @@ -4628,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * We don't consider swapping or file mapped pages because THP does not - * support them for now. + * We don't consider PMD mapped swapping or file mapped pages because THP does + * not support them for now. * Caller should make sure that pmd_trans_huge(pmd) is true. 
*/ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, @@ -5404,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page->mem_cgroup) + if (compound_head(page)->mem_cgroup) goto out; if (do_swap_account) { @@ -5887,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5907,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); + nr_entries = hpage_nr_pages(page); + /* Get references for the tail pages, too */ + if (nr_entries > 1) + mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), + nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, 1); + mem_cgroup_swap_statistics(swap_memcg, nr_entries); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memory, 1); + page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) - page_counter_charge(&swap_memcg->memsw, 1); - page_counter_uncharge(&memcg->memsw, 1); + page_counter_charge(&swap_memcg->memsw, nr_entries); + page_counter_uncharge(&memcg->memsw, nr_entries); } /* @@ -5929,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. 
*/ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, false, -1); + mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), + -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1cd3b3569af8..88366626c0b7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + arch_unmap_kpfn(pfn); + orig_head = hpage = compound_head(p); num_poisoned_pages_inc(); diff --git a/mm/memory.c b/mm/memory.c index 0e517be91a89..13ee83b43878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,6 +68,7 @@ #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> +#include <linux/oom.h> #include <asm/io.h> #include <asm/mmu_context.h> @@ -215,12 +216,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } -/* tlb_gather_mmu - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @fullmm argument is used when @mm is without - * users and we're going to destroy the full address space (exit/execve). - */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) { tlb->mm = mm; @@ -275,10 +272,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb) * Called at the end of the shootdown operation to free up any resources * that were required. 
*/ -void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +void arch_tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end, bool force) { struct mmu_gather_batch *batch, *next; + if (force) + __tlb_adjust_range(tlb, start, end - start); + tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -398,6 +399,34 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +/* tlb_gather_mmu + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @fullmm argument is used when @mm is without + * users and we're going to destroy the full address space (exit/execve). + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + arch_tlb_gather_mmu(tlb, mm, start, end); + inc_tlb_flush_pending(tlb->mm); +} + +void tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end) +{ + /* + * If there are parallel threads are doing PTE changes on same range + * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB + * flush by batching, a thread has stable TLB entry can fail to flush + * the TLB by observing pte_none|!pte_dirty, for example so flush TLB + * forcefully if we detect parallel PTE batching threads. + */ + bool force = mm_tlb_flush_nested(tlb->mm); + + arch_tlb_finish_mmu(tlb, start, end, force); + dec_tlb_flush_pending(tlb->mm); +} + /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
@@ -1197,6 +1226,7 @@ again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; @@ -1483,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { unmap_single_vma(&tlb, vma, start, end, NULL); + + /* + * zap_page_range does not specify whether mmap_sem should be + * held for read or write. That allows parallel zap_page_range + * operations to unmap a PTE and defer a flush meaning that + * this call observes pte_none and fails to flush the TLB. + * Rather than adding a complex API, ensure that no stale + * TLB entries exist when this call returns. + */ + flush_tlb_range(vma, start, end); + } + mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } @@ -1646,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1658,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (!pte) goto out; retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; + if (!pte_none(*pte)) { + if (mkwrite) { + /* + * For read faults on private mappings the PFN passed + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared + * mapping and we expect the PFNs to match. 
+ */ + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + goto out_unlock; + entry = *pte; + goto out_mkwrite; + } else + goto out_unlock; + } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + +out_mkwrite: + if (mkwrite) { + entry = pte_mkyoung(entry); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } + set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1736,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); return ret; } EXPORT_SYMBOL(vm_insert_pfn_prot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; @@ -1772,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, pgprot); + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); +} + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, false); + } EXPORT_SYMBOL(vm_insert_mixed); +int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, true); +} +EXPORT_SYMBOL(vm_insert_mixed_mkwrite); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results @@ -2541,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf) * not dirty accountable. 
*/ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { - int total_mapcount; + int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2556,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } - if (reuse_swap_page(vmf->page, &total_mapcount)) { - if (total_mapcount == 1) { + if (reuse_swap_page(vmf->page, &total_map_swapcount)) { + if (total_map_swapcount == 1) { /* * The page is all ours. Move it to * our anon_vma so the rmap code will @@ -2674,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page, *swapcache; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; + struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; + bool vma_readahead = swap_use_vma_readahead(); - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (vma_readahead) + page = swap_readahead_detect(vmf, &swap_ra); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { + if (page) + put_page(page); goto out; + } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { @@ -2699,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf) goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); - page = lookup_swap_cache(entry); + if (!page) + page = lookup_swap_cache(entry, vma_readahead ? 
vma : NULL, + vmf->address); if (!page) { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, - vmf->address); + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte @@ -2864,6 +2955,7 @@ static int do_anonymous_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; + int ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -2896,6 +2988,9 @@ static int do_anonymous_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2930,6 +3025,10 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!pte_none(*vmf->pte)) goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2949,7 +3048,7 @@ setpte: update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3223,7 +3322,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, int finish_fault(struct vm_fault *vmf) { struct page *page; - int ret; + int ret = 0; /* Did we COW the page? 
*/ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3231,7 +3330,15 @@ int finish_fault(struct vm_fault *vmf) page = vmf->cow_page; else page = vmf->page; - ret = alloc_set_pte(vmf, vmf->memcg, page); + + /* + * check even for read faults because we might have lost our CoWed + * page + */ + if (!(vmf->vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vmf->vma->vm_mm); + if (!ret) + ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3871,19 +3978,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } - /* - * This mm has been already reaped by the oom reaper and so the - * refault cannot be trusted in general. Anonymous refaults would - * lose data and give a zero page instead e.g. This is especially - * problem for use_mm() because regular tasks will just die and - * the corrupted data will not be visible anywhere while kthread - * will outlive the oom victim and potentially propagate the date - * further. 
- */ - if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) - ret = VM_FAULT_SIGBUS; - return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); @@ -3975,7 +4069,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + unsigned long *start, unsigned long *end, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4002,17 +4097,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; + if (start && end) { + *start = address & PMD_MASK; + *end = *start + PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { *pmdpp = pmd; return 0; } spin_unlock(*ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; + if (start && end) { + *start = address & PAGE_MASK; + *end = *start + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; @@ -4020,6 +4127,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); out: return -EINVAL; } @@ -4031,20 +4140,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, NULL, - ptlp))); + !(res = __follow_pte_pmd(mm, address, NULL, NULL, + ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t 
**pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, - ptlp))); + !(res = __follow_pte_pmd(mm, address, start, end, + ptepp, pmdpp, ptlp))); return res; } EXPORT_SYMBOL(follow_pte_pmd); @@ -4307,19 +4417,53 @@ static void clear_gigantic_page(struct page *page, } } void clear_huge_page(struct page *page, - unsigned long addr, unsigned int pages_per_huge_page) + unsigned long addr_hint, unsigned int pages_per_huge_page) { - int i; + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } + /* Clear sub-page to access last to keep its cache lines hot */ might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If sub-page to access in first half of huge page */ + base = 0; + l = n; + /* Clear sub-pages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } else { + /* If sub-page to access in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Clear sub-pages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } + /* + * Clear remaining sub-pages in left-right-left-right pattern + * towards the sub-page to access + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + clear_user_highpage(page + left_idx, + addr + left_idx * PAGE_SIZE); cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + right_idx, + addr + right_idx * 
PAGE_SIZE); } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc317aac2..73bf17df6899 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) -{ - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); - - /* - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL - * physically before ZONE_MOVABLE. All we need is they do not - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE - * though so let's stick with it for simplicity for now. - * TODO make sure we do not overlap with ZONE_DEVICE - */ - if (online_type == MMOP_ONLINE_KERNEL) { - if (zone_is_empty(movable_zone)) - return true; - return movable_zone->zone_start_pfn >= pfn + nr_pages; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(default_zone) <= pfn; - } - - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ - return online_type == MMOP_ONLINE_KEEP; -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. 
*/ -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, +static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -872,17 +847,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, return &pgdat->node_zones[ZONE_NORMAL]; } -static inline bool movable_pfn_range(int nid, struct zone *default_zone, - unsigned long start_pfn, unsigned long nr_pages) +static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) { - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, - MMOP_ONLINE_KERNEL)) - return true; + struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, + nr_pages); + struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); + bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); - if (!movable_node_is_enabled()) - return false; + /* + * We inherit the existing zone in a simple case where zones do not + * overlap in the given range + */ + if (in_kernel ^ in_movable) + return (in_kernel) ? kernel_zone : movable_zone; - return !zone_intersects(default_zone, start_pfn, nr_pages); + /* + * If the range doesn't belong to any zone or two zones overlap in the + * given range then we use movable zone only if movable_node is + * enabled because we always online to a kernel zone by default. + */ + return movable_node_enabled ? 
movable_zone : kernel_zone; +} + +struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, + unsigned long nr_pages) +{ + if (online_type == MMOP_ONLINE_KERNEL) + return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); + + if (online_type == MMOP_ONLINE_MOVABLE) + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + + return default_zone_for_pfn(nid, start_pfn, nr_pages); } /* @@ -892,28 +890,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone, static struct zone * __meminit move_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); - - if (online_type == MMOP_ONLINE_KEEP) { - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - /* - * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use - * movable zone if that is not possible (e.g. we are within - * or past the existing movable zone). 
movable_node overrides - * this default and defaults to movable zone - */ - if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) - zone = movable_zone; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - zone = &pgdat->node_zones[ZONE_MOVABLE]; - } + struct zone *zone; + zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); move_pfn_range_to_zone(zone, start_pfn, nr_pages); return zone; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -925,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ struct memory_notify arg; nid = pfn_to_nid(pfn); - if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) - return -EINVAL; - /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -945,10 +926,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ * This means the page allocator ignores this zone. * So, zonelist must be updated after online. 
*/ - mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) { need_zonelists_rebuild = 1; - build_all_zonelists(NULL, zone); + setup_zone_pageset(zone); } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, @@ -956,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (ret) { if (need_zonelists_rebuild) zone_pcp_reset(zone); - mutex_unlock(&zonelists_mutex); goto failed_addition; } @@ -969,13 +948,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); else zone_pcp_update(zone); } - mutex_unlock(&zonelists_mutex); - init_per_zone_wmark_min(); if (onlined_pages) { @@ -1046,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ - mutex_lock(&zonelists_mutex); - build_all_zonelists(pgdat, NULL); - mutex_unlock(&zonelists_mutex); + build_all_zonelists(pgdat); /* * zone->managed_pages is set to an approximate value in @@ -1100,13 +1075,6 @@ int try_online_node(int nid) node_set_online(nid); ret = register_one_node(nid); BUG_ON(ret); - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } - out: mem_hotplug_done(); return ret; @@ -1722,9 +1690,7 @@ repeat: if (!populated_zone(zone)) { zone_pcp_reset(zone); - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); + build_all_zonelists(NULL); } else zone_pcp_update(zone); @@ -1750,7 +1716,7 @@ failed_removal: return ret; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return 
__offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d911fa5cb2a7..618ab125228b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy |= (pol->flags & MPOL_MODE_FLAGS); } - if (vma) { - up_read(&current->mm->mmap_sem); - vma = NULL; - } - err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { diff --git a/mm/migrate.c b/mm/migrate.c index 627671551873..e84eeb4e4356 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -41,6 +41,7 @@ #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> +#include <linux/ptrace.h> #include <asm/tlbflush.h> @@ -1652,7 +1653,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, const int __user *, nodes, int __user *, status, int, flags) { - const struct cred *cred = current_cred(), *tcred; struct task_struct *task; struct mm_struct *mm; int err; @@ -1676,14 +1676,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, /* * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * process. Use the regular "ptrace_may_access()" checks. */ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out; @@ -1937,12 +1932,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - /* - * We are not sure a pending tlb flush here is for a huge page - * mapping or not.
Hence use the tlb range variant - */ - if (mm_tlb_flush_pending(mm)) - flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ __SetPageLocked(new_page); diff --git a/mm/mmap.c b/mm/mmap.c index f19efcf75418..4c5981651407 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -44,6 +44,7 @@ #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> +#include <linux/oom.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (vma->vm_start >= end) return 0; - if (uf) { - int error = userfaultfd_unmap_prep(vma, start, end, uf); - - if (error) - return error; - } - /* * If we need to split any vma, do it now to save pain later. * @@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } vma = prev ? prev->vm_next : mm->mmap; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain splitted, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + int error = userfaultfd_unmap_prep(vma, start, end, uf); + if (error) + return error; + } + /* * unlock any mlock()ed ranges before detaching vmas */ @@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); + if (unlikely(tsk_is_oom_victim(current))) { + /* + * Wait for oom_reap_task() to stop working on this + * mm. Because MMF_OOM_SKIP is already set before + * calling down_read(), oom_reap_task() will not run + * on this "mm" post up_write(). 
+ * + * tsk_is_oom_victim() cannot be set from under us + * either because current->mm is already set to NULL + * under task_lock before calling mmput and oom_mm is + * set not NULL by the OOM killer only if current->mm + * is found not NULL while holding the task_lock. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); @@ -3514,7 +3540,7 @@ static int init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -3535,7 +3561,7 @@ static int init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; @@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, break; case MEM_OFFLINE: - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 54ca54562928..314285284e6e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ - struct mmu_notifier *mn; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); - } - srcu_read_unlock(&srcu, id); -} - void 
__mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 1a8c9ca83e48..bd0f409922cb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, atomic_read(&vma->vm_mm->mm_users) == 1) target_node = numa_node_id(); + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -243,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - set_tlb_flush_pending(mm); + inc_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -255,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); - clear_tlb_flush_pending(mm); + dec_tlb_flush_pending(mm); return pages; } diff --git a/mm/mremap.c b/mm/mremap.c index cd8a1b199ef9..7395564daa6c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, @@ -383,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); + /* + * !old_len is a special case where an attempt is made to 'duplicate' + * a mapping. This makes no sense for private mappings as it will + * instead create a fresh/new mapping unrelated to the original. This + * is contrary to the basic idea of mremap which creates new mappings + * based on the original. 
There are no known use cases for this + * behavior. As a result, fail such attempts. + */ + if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); + return ERR_PTR(-EINVAL); + } + if (is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); @@ -428,6 +442,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap_early, struct list_head *uf_unmap) { struct mm_struct *mm = current->mm; @@ -446,7 +461,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len, NULL); + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; @@ -514,6 +529,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) @@ -541,7 +557,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf, &uf_unmap); + &locked, &uf, &uf_unmap_early, &uf_unmap); goto out; } @@ -621,6 +637,7 @@ out: up_write(&current->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); + userfaultfd_unmap_complete(mm, &uf_unmap_early); mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); userfaultfd_unmap_complete(mm, &uf_unmap); return ret; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 36454d0f96ee..3637809a18d0 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -146,22
+146,6 @@ static unsigned long __init free_low_memory_core_early(void) NULL) count += __free_memory_core(start, end); -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - { - phys_addr_t size; - - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - } -#endif - return count; } diff --git a/mm/nommu.c b/mm/nommu.c index fc184f597d59..53d5175a5c14 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e8b4f030c1c..99736e026712 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * increase mm_users only after we know we will reap something so - * that the mmput_async is called only when we have reaped something - * and delayed __mmput doesn't matter that much + * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_OOM_SKIP must run + * under mmap_sem for reading because it serializes against the + * down_write();up_write() cycle in exit_mmap(). 
*/ - if (!mmget_not_zero(mm)) { + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); - /* - * Drop our reference but make sure the mmput slow path is called from a - * different context because we shouldn't risk we get stuck there and - * put the oom_reaper out of the way. - */ - mmput_async(mm); trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); @@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just set TIF_MEMDIE so it can die quickly + * its children or threads, just give it access to memory reserves + * so it can die quickly */ task_lock(p); if (task_will_free_mem(p)) { @@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message) count_memcg_event_mm(mm, OOM_KILL); /* - * We should send SIGKILL before setting TIF_MEMDIE in order to prevent - * the OOM victim from depleting the memory reserves from the user - * space under its control. + * We should send SIGKILL before granting access to memory reserves + * in order to prevent the OOM victim from depleting the memory + * reserves from the user space under its control. 
*/ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96e93b214d31..0b9c5cbe8eba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES); + x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to @@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive - * global_page_state() too often. So scale it near-sqrt to the safety margin + * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, @@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - lock_page_memcg(page); + memcg = lock_page_memcg(page); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + /* + * NOTE: Page might be free now! Writeback doesn't hold a page + * reference on its own, it relies on truncation to wait for + * the clearing of PG_writeback. The below can only access + * page state that is static across allocation cycles. 
+ */ if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); + dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - unlock_page_memcg(page); + __unlock_page_memcg(memcg); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d30e914afb6..a9add06fe768 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,8 @@ #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> +#include <linux/lockdep.h> +#include <linux/nmi.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -1584,6 +1586,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); @@ -2531,9 +2537,14 @@ void drain_all_pages(struct zone *zone) #ifdef CONFIG_HIBERNATION +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. 
+ */ +#define WD_PAGE_COUNT (128*1024) + void mark_free_pages(struct zone *zone) { - unsigned long pfn, max_zone_pfn; + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; unsigned long flags; unsigned int order, t; struct page *page; @@ -2548,6 +2559,11 @@ void mark_free_pages(struct zone *zone) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + if (page_zone(page) != zone) continue; @@ -2561,8 +2577,13 @@ void mark_free_pages(struct zone *zone) unsigned long i; pfn = page_to_pfn(page); - for (i = 0; i < (1UL << order); i++) + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } swsusp_set_page_free(pfn_to_page(pfn + i)); + } } } spin_unlock_irqrestore(&zone->lock, flags); @@ -2930,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2943,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * the high-atomic reserves. This will over-estimate the size of the * atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!alloc_harder)) { free_pages -= z->nr_reserved_highatomic; - else - min -= min / 4; + } else { + /* + * OOM victims can try even harder than normal ALLOC_HARDER + * users on the grounds that it's definitely going to be in + * the exit path shortly and free memory. Any allocation it + * makes during the free path will be small and short-lived. 
+ */ + if (alloc_flags & ALLOC_OOM) + min -= min / 2; + else + min -= min / 4; + } + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ @@ -3184,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) * of allowed nodes. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) - if (test_thread_flag(TIF_MEMDIE) || + if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -3271,10 +3303,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if - * we're still under heavy pressure. + * we're still under heavy pressure. But make sure that this reclaim + * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY + * allocation which will never fail due to oom_lock already held. 
*/ - page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); + page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & + ~__GFP_DIRECT_RECLAIM, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; @@ -3490,6 +3525,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } #endif /* CONFIG_COMPACTION */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_fs_reclaim(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_acquire(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_release(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -3504,7 +3580,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(gfp_mask); + fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3512,7 +3588,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + 
fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3603,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask) return alloc_flags; } -bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +static bool oom_reserves_allowed(struct task_struct *tsk) { - if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + if (!tsk_is_oom_victim(tsk)) return false; + /* + * !MMU doesn't have oom reaper so give access to memory reserves + * only to the thread with TIF_MEMDIE set + */ + if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) + return false; + + return true; +} + +/* + * Distinguish requests which really need access to full memory + * reserves from oom victims which can live with a portion of it + */ +static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) +{ + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return 0; if (gfp_mask & __GFP_MEMALLOC) - return true; + return ALLOC_NO_WATERMARKS; if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - return true; - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - return true; + return ALLOC_NO_WATERMARKS; + if (!in_interrupt()) { + if (current->flags & PF_MEMALLOC) + return ALLOC_NO_WATERMARKS; + else if (oom_reserves_allowed(current)) + return ALLOC_OOM; + } - return false; + return 0; +} + +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!__gfp_pfmemalloc_flags(gfp_mask); } /* @@ -3770,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long alloc_start = jiffies; unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; + int reserve_flags; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3875,15 +3977,16 @@ retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); - if (gfp_pfmemalloc_allowed(gfp_mask)) - alloc_flags = ALLOC_NO_WATERMARKS; + reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); + if (reserve_flags) + alloc_flags = reserve_flags; /* * 
Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { + if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); @@ -3960,8 +4063,8 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && - (alloc_flags == ALLOC_NO_WATERMARKS || + if (tsk_is_oom_victim(current) && + (alloc_flags == ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; @@ -4041,7 +4144,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - lockdep_trace_alloc(gfp_mask); + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); @@ -4443,7 +4547,7 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. */ - available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will @@ -4458,8 +4562,9 @@ long si_mem_available(void) * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. 
*/ - available += global_page_state(NR_SLAB_RECLAIMABLE) - - min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + available += global_node_page_state(NR_SLAB_RECLAIMABLE) - + min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, + wmark_low); if (available < 0) available = 0; @@ -4471,7 +4576,7 @@ void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_page_state(NR_FREE_PAGES); + val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); @@ -4602,15 +4707,15 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_SLAB_RECLAIMABLE), - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_node_page_state(NR_SLAB_RECLAIMABLE), + global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_page_state(NR_FREE_PAGES), + global_zone_page_state(NR_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_zone_page_state(NR_FREE_PAGES), free_pcp, - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) @@ -4772,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * * Add all populated zones of a node to the zonelist. 
*/ -static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones) +static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) { struct zone *zone; enum zone_type zone_type = MAX_NR_ZONES; + int nr_zones = 0; do { zone_type--; zone = pgdat->node_zones + zone_type; if (managed_zone(zone)) { - zoneref_set_zone(zone, - &zonelist->_zonerefs[nr_zones++]); + zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } } while (zone_type); @@ -4791,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, return nr_zones; } - -/* - * zonelist_order: - * 0 = automatic detection of better ordering. - * 1 = order by ([node] distance, -zonetype) - * 2 = order by (-zonetype, [node] distance) - * - * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create - * the same zonelist. So only NUMA can configure this param. - */ -#define ZONELIST_ORDER_DEFAULT 0 -#define ZONELIST_ORDER_NODE 1 -#define ZONELIST_ORDER_ZONE 2 - -/* zonelist order in the kernel. - * set_zonelist_order() will set this to NODE or ZONE. - */ -static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; -static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; - - #ifdef CONFIG_NUMA -/* The value user specified ....changed by config */ -static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; -/* string for sysctl */ -#define NUMA_ZONELIST_ORDER_LEN 16 -char numa_zonelist_order[16] = "default"; - -/* - * interface for configure zonelist ordering. - * command line option "numa_zonelist_order" - * = "[dD]efault - default, automatic configuration. 
- * = "[nN]ode - order by node locality, then by zone within node - * = "[zZ]one - order by zone, then by locality within zone - */ static int __parse_numa_zonelist_order(char *s) { - if (*s == 'd' || *s == 'D') { - user_zonelist_order = ZONELIST_ORDER_DEFAULT; - } else if (*s == 'n' || *s == 'N') { - user_zonelist_order = ZONELIST_ORDER_NODE; - } else if (*s == 'z' || *s == 'Z') { - user_zonelist_order = ZONELIST_ORDER_ZONE; - } else { - pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); + /* + * We used to support different zonlists modes but they turned + * out to be just not useful. Let's keep the warning in place + * if somebody still use the cmd line parameter so that we do + * not fail it silently + */ + if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { + pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4844,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - int ret; - if (!s) return 0; - ret = __parse_numa_zonelist_order(s); - if (ret == 0) - strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); - - return ret; + return __parse_numa_zonelist_order(s); } early_param("numa_zonelist_order", setup_numa_zonelist_order); +char numa_zonelist_order[] = "Node"; + /* * sysctl handler for numa_zonelist_order */ @@ -4864,40 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - char saved_string[NUMA_ZONELIST_ORDER_LEN]; + char *str; int ret; - static DEFINE_MUTEX(zl_order_mutex); - mutex_lock(&zl_order_mutex); - if (write) { - if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { - ret = -EINVAL; - goto out; - } - strcpy(saved_string, (char *)table->data); - } - ret = proc_dostring(table, write, buffer, length, ppos); - if (ret) - goto out; - if (write) { - int oldval = user_zonelist_order; + if (!write) + return proc_dostring(table, 
write, buffer, length, ppos); + str = memdup_user_nul(buffer, 16); + if (IS_ERR(str)) + return PTR_ERR(str); - ret = __parse_numa_zonelist_order((char *)table->data); - if (ret) { - /* - * bogus value. restore saved string - */ - strncpy((char *)table->data, saved_string, - NUMA_ZONELIST_ORDER_LEN); - user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } - } -out: - mutex_unlock(&zl_order_mutex); + ret = __parse_numa_zonelist_order(str); + kfree(str); return ret; } @@ -4971,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) * This results in maximum locality--normal zone overflows into local * DMA zone, if any--but risks exhausting DMA zone. */ -static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) +static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, + unsigned nr_nodes) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int i; + + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + + for (i = 0; i < nr_nodes; i++) { + int nr_zones; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) - ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + pg_data_t *node = NODE_DATA(node_order[i]); + + nr_zones = build_zonerefs_node(node, zonerefs); + zonerefs += nr_zones; + } + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -4989,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) */ static void build_thisnode_zonelists(pg_data_t *pgdat) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; - zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); - zonelist->_zonerefs[j].zone = NULL; - 
zonelist->_zonerefs[j].zone_idx = 0; + zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -5004,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) * exhausted, but results in overflowing to remote node while memory * may still exist in local DMA zone. */ -static int node_order[MAX_NUMNODES]; - -static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) -{ - int pos, j, node; - int zone_type; /* needs to be signed */ - struct zone *z; - struct zonelist *zonelist; - - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - pos = 0; - for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (managed_zone(z)) { - zoneref_set_zone(z, - &zonelist->_zonerefs[pos++]); - check_highest_zone(zone_type); - } - } - } - zonelist->_zonerefs[pos].zone = NULL; - zonelist->_zonerefs[pos].zone_idx = 0; -} - -#if defined(CONFIG_64BIT) -/* - * Devices that require DMA32/DMA are relatively rare and do not justify a - * penalty to every machine in case the specialised case applies. Default - * to Node-ordering on 64-bit NUMA machines - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_NODE; -} -#else -/* - * On 32-bit, the Normal zone needs to be preserved for allocations accessible - * by the kernel. If processes running on node 0 deplete the low memory zone - * then reclaim will occur more frequency increasing stalls and potentially - * be easier to OOM if a large percentage of the zone is under writeback or - * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. - * Hence, default to zone ordering on 32-bit. 
- */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_ZONE; -} -#endif /* CONFIG_64BIT */ - -static void set_zonelist_order(void) -{ - if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) - current_zonelist_order = default_zonelist_order(); - else - current_zonelist_order = user_zonelist_order; -} static void build_zonelists(pg_data_t *pgdat) { - int i, node, load; + static int node_order[MAX_NUMNODES]; + int node, load, nr_nodes = 0; nodemask_t used_mask; int local_node, prev_node; - struct zonelist *zonelist; - unsigned int order = current_zonelist_order; - - /* initialize zonelists */ - for (i = 0; i < MAX_ZONELISTS; i++) { - zonelist = pgdat->node_zonelists + i; - zonelist->_zonerefs[0].zone = NULL; - zonelist->_zonerefs[0].zone_idx = 0; - } /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; @@ -5085,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - i = 0; - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* * We don't want to pressure a particular node. @@ -5097,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat) node_distance(local_node, prev_node)) node_load[node] = load; + node_order[nr_nodes++] = node; prev_node = node; load--; - if (order == ZONELIST_ORDER_NODE) - build_zonelists_in_node_order(pgdat, node); - else - node_order[i++] = node; /* remember order */ - } - - if (order == ZONELIST_ORDER_ZONE) { - /* calculate node order -- i.e., DMA last! 
*/ - build_zonelists_in_zone_order(pgdat, i); } + build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); } @@ -5135,21 +5111,17 @@ static void setup_min_unmapped_ratio(void); static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ -static void set_zonelist_order(void) -{ - current_zonelist_order = ZONELIST_ORDER_ZONE; -} - static void build_zonelists(pg_data_t *pgdat) { int node, local_node; - enum zone_type j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; /* * Now we build the zonelist so that it contains the zones @@ -5162,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } #endif /* CONFIG_NUMA */ @@ -5194,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -static void setup_zone_pageset(struct zone *zone); - -/* - * Global mutex to protect against size modification of zonelists - * as well as to serialize pageset setup for the 
new populated zone. - */ -DEFINE_MUTEX(zonelists_mutex); -/* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *data) +static void __build_all_zonelists(void *data) { int nid; - int cpu; + int __maybe_unused cpu; pg_data_t *self = data; + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + /* + * This node is hotadded and no memory is yet present. So just + * building zonelists is fine - no need to touch other nodes. + */ if (self && !node_online(self->node_id)) { build_zonelists(self); - } - - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); - - build_zonelists(pgdat); - } + } else { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); - /* - * Initialize the boot_pagesets that are going to be used - * for bootstrapping processors. The real pagesets for - * each zone will be allocated later when the per cpu - * allocator is available. - * - * boot_pagesets are used also for bootstrapping offline - * cpus if the system is already booted because the pagesets - * are needed to initialize allocators on a specific cpu too. - * F.e. the percpu allocator needs the page allocator which - * needs the percpu allocator in order to allocate its pagesets - * (a chicken-egg dilemma). - */ - for_each_possible_cpu(cpu) { - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + build_zonelists(pgdat); + } #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* @@ -5248,45 +5204,53 @@ static int __build_all_zonelists(void *data) * secondary cpus' numa_mem as they come on-line. During * node/memory hotplug, we'll fixup all on-line cpus. 
*/ - if (cpu_online(cpu)) + for_each_online_cpu(cpu) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } - return 0; + spin_unlock(&lock); } static noinline void __init build_all_zonelists_init(void) { + int cpu; + __build_all_zonelists(NULL); + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } /* - * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * - * __ref due to (1) call of __meminit annotated setup_zone_pageset - * [we're only called with non-NULL zone through __meminit paths] and - * (2) call of __init annotated helper build_all_zonelists_init + * __ref due to call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. 
*/ -void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) +void __ref build_all_zonelists(pg_data_t *pgdat) { - set_zonelist_order(); - if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { -#ifdef CONFIG_MEMORY_HOTPLUG - if (zone) - setup_zone_pageset(zone); -#endif - /* we have to stop all cpus to guarantee there is no user - of zonelist */ - stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); + __build_all_zonelists(pgdat); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -5302,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, - zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA @@ -5558,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_high_and_batch(zone, pcp); } -static void __meminit setup_zone_pageset(struct zone *zone) +void __meminit setup_zone_pageset(struct zone *zone) { int cpu; zone->pageset = alloc_percpu(struct per_cpu_pageset); @@ -7012,9 +6975,11 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { - mutex_lock(&zonelists_mutex); + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); __setup_per_zone_wmarks(); - mutex_unlock(&zonelists_mutex); + spin_unlock(&lock); } /* @@ -7666,7 +7631,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. 
*/ if (test_pages_isolated(outer_start, end, false)) { - pr_info("%s: [%lx, %lx) PFNs busy\n", + pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", __func__, outer_start, end); ret = -EBUSY; goto done; diff --git a/mm/page_ext.c b/mm/page_ext.c index 88ccc044b09a..32f18911deda 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid) return addr; } - if (node_state(nid, N_HIGH_MEMORY)) - addr = vzalloc_node(size, nid); - else - addr = vzalloc(size); + addr = vzalloc_node(size, nid); return addr; } @@ -409,6 +406,7 @@ void __init page_ext_init(void) continue; if (init_section_page_ext(pfn, nid)) goto oom; + cond_resched(); } } hotplug_memory_notifier(page_ext_callback, 0); diff --git a/mm/page_idle.c b/mm/page_idle.c index 1b0f48c62316..4bd03a8d809e 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = { NULL, }; -static struct attribute_group page_idle_attr_group = { +static const struct attribute_group page_idle_attr_group = { .bin_attrs = page_idle_bin_attrs, .name = "page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index b6c4ac388209..20139b90125a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -22,21 +22,24 @@ #include <linux/frontswap.h> #include <linux/blkdev.h> #include <linux/uio.h> +#include <linux/sched/task.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { + int i, nr = hpage_nr_pages(page); struct bio *bio; - bio = bio_alloc(gfp_flags, 1); + bio = bio_alloc(gfp_flags, nr); if (bio) { bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_end_io = end_io; - bio_add_page(bio, page, PAGE_SIZE, 0); - BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); + for (i = 0; i < nr; i++) + bio_add_page(bio, page + i, PAGE_SIZE, 0); + VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr); } return bio; } @@ -136,6 
+139,7 @@ out: WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); wake_up_process(waiter); + put_task_struct(waiter); } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -260,6 +264,15 @@ static sector_t swap_page_sector(struct page *page) return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); } +static inline void count_swpout_vm_event(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (unlikely(PageTransHuge(page))) + count_vm_event(THP_SWPOUT); +#endif + count_vm_events(PSWPOUT, hpage_nr_pages(page)); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -311,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); return 0; } @@ -324,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); submit_bio(bio); @@ -378,6 +391,11 @@ int swap_readpage(struct page *page, bool do_poll) goto out; } bdev = bio->bi_bdev; + /* + * Keep this task valid during swap readpage because the oom killer may + * attempt to access it in the page fault retry time check. 
+ */ + get_task_struct(current); bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); diff --git a/mm/page_owner.c b/mm/page_owner.c index 0fd9dcf2c5dc..8e2d7137510c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; static depot_stack_handle_t failure_handle; +static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); @@ -53,7 +54,7 @@ static bool need_page_owner(void) return true; } -static noinline void register_dummy_stack(void) +static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; struct stack_trace dummy; @@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void) dummy.skip = 0; save_stack_trace(&dummy); - dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); + return depot_save_stack(&dummy, GFP_KERNEL); } -static noinline void register_failure_stack(void) +static noinline void register_dummy_stack(void) { - unsigned long entries[4]; - struct stack_trace failure; + dummy_handle = create_dummy_stack(); +} - failure.nr_entries = 0; - failure.max_entries = ARRAY_SIZE(entries); - failure.entries = &entries[0]; - failure.skip = 0; +static noinline void register_failure_stack(void) +{ + failure_handle = create_dummy_stack(); +} - save_stack_trace(&failure); - failure_handle = depot_save_stack(&failure, GFP_KERNEL); +static noinline void register_early_stack(void) +{ + early_handle = create_dummy_stack(); } static void init_page_owner(void) @@ -88,6 +90,7 @@ static void init_page_owner(void) register_dummy_stack(); register_failure_stack(); + register_early_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } -noinline void __set_page_owner(struct page *page, unsigned int order, - gfp_t gfp_mask) +static inline void 
__set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); struct page_owner *page_owner; - if (unlikely(!page_ext)) - return; - page_owner = get_page_owner(page_ext); - page_owner->handle = save_stack(gfp_mask); + page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; @@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order, __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +noinline void __set_page_owner(struct page *page, unsigned int order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); + depot_stack_handle_t handle; + + if (unlikely(!page_ext)) + return; + + handle = save_stack(gfp_mask); + __set_page_owner_handle(page_ext, handle, order, gfp_mask); +} + void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); @@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* - * We are safe to check buddy flag and order, because - * this is init stage and only single thread runs. + * To avoid having to grab zone->lock, be a little + * careful when reading buddy page order. The only + * danger is that we skip too much and potentially miss + * some early allocated pages, which is better than + * heavy lock contention. 
*/ if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long order = page_order_unsafe(page); + + if (order > 0 && order < MAX_ORDER) + pfn += (1UL << order) - 1; continue; } @@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (unlikely(!page_ext)) continue; - /* Maybe overraping zone */ + /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; /* Found early allocated page */ - set_page_owner(page, 0, 0); + __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } + cond_resched(); } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", @@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat) { struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); init_pages_in_zone(pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); } } diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index cd2442e13d8f..7065faf74b46 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -4,6 +4,22 @@ #include <linux/types.h> #include <linux/percpu.h> +/* + * pcpu_block_md is the metadata block struct. + * Each chunk's bitmap is split into a number of full blocks. + * All units are in terms of bits. 
+ */ +struct pcpu_block_md { + int contig_hint; /* contig hint for block */ + int contig_hint_start; /* block relative starting + position of the contig hint */ + int left_free; /* size of free space along + the left side of the block */ + int right_free; /* size of free space along + the right side of the block */ + int first_free; /* block position of first free */ +}; + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -11,24 +27,29 @@ struct pcpu_chunk { #endif struct list_head list; /* linked to pcpu_slot lists */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ + int free_bytes; /* free bytes in the chunk */ + int contig_bits; /* max contiguous size hint */ + int contig_bits_start; /* contig_bits starting + offset */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used before the sentry */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ + unsigned long *alloc_map; /* allocation map */ + unsigned long *bound_map; /* boundary map */ + struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ - int first_free; /* no free below this */ + int first_bit; /* no free below this */ bool immutable; /* no [de]population allowed */ - bool has_reserved; /* Indicates if chunk has reserved space - at the beginning. Reserved chunk will - contain reservation for static chunk. - Dynamic chunk will contain reservation - for static and reserved chunks. 
*/ + int start_offset; /* the overlap with the previous + region to have a page aligned + base_addr */ + int end_offset; /* additional area required to + have the region end page + aligned */ + + int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ + int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock; extern struct list_head *pcpu_slot; extern int pcpu_nr_slots; +extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; +/** + * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bitmap blocks used. + */ +static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) +{ + return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; +} + +/** + * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap + * @pages: number of physical pages + * + * This conversion is from physical pages to the number of bits + * required in the bitmap. + */ +static inline int pcpu_nr_pages_to_map_bits(int pages) +{ + return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; +} + +/** + * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bits in the bitmap. 
+ */ +static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) +{ + return pcpu_nr_pages_to_map_bits(chunk->nr_pages); +} + #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> diff --git a/mm/percpu-km.c b/mm/percpu-km.c index eb58aa4c0997..d2a76642c4ae 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, 0, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages, false); spin_unlock_irq(&pcpu_lock); pcpu_stats_chunk_alloc(); diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 03524a56eeff..6142484e88f7 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -18,7 +18,7 @@ #include "percpu-internal.h" #define P(X, Y) \ - seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y) + seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y) struct percpu_stats pcpu_stats; struct pcpu_alloc_info pcpu_stats_ai; @@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b) } /* - * Iterates over all chunks to find the max # of map entries used. + * Iterates over all chunks to find the max nr_alloc entries. */ -static int find_max_map_used(void) +static int find_max_nr_alloc(void) { struct pcpu_chunk *chunk; - int slot, max_map_used; + int slot, max_nr_alloc; - max_map_used = 0; + max_nr_alloc = 0; for (slot = 0; slot < pcpu_nr_slots; slot++) list_for_each_entry(chunk, &pcpu_slot[slot], list) - max_map_used = max(max_map_used, chunk->map_used); + max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc); - return max_map_used; + return max_nr_alloc; } /* * Prints out chunk state. Fragmentation is considered between * the beginning of the chunk to the last allocation. + * + * All statistics are in bytes unless stated otherwise. 
*/ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, - void *buffer) + int *buffer) { - int i, s_index, last_alloc, alloc_sign, as_len; + int i, last_alloc, as_len, start, end; int *alloc_sizes, *p; /* statistics */ int sum_frag = 0, max_frag = 0; int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; alloc_sizes = buffer; - s_index = chunk->has_reserved ? 1 : 0; - - /* find last allocation */ - last_alloc = -1; - for (i = chunk->map_used - 1; i >= s_index; i--) { - if (chunk->map[i] & 1) { - last_alloc = i; - break; - } - } - /* if the chunk is not empty - ignoring reserve */ - if (last_alloc >= s_index) { - as_len = last_alloc + 1 - s_index; - - /* - * Iterate through chunk map computing size info. - * The first bit is overloaded to be a used flag. - * negative = free space, positive = allocated - */ - for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { - alloc_sign = (*p & 1) ? 1 : -1; - alloc_sizes[i] = alloc_sign * - ((p[1] & ~1) - (p[0] & ~1)); + /* + * find_last_bit returns the start value if nothing found. + * Therefore, we must determine if it is a failure of find_last_bit + * and set the appropriate value. + */ + last_alloc = find_last_bit(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - + chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1); + last_alloc = test_bit(last_alloc, chunk->alloc_map) ? + last_alloc + 1 : 0; + + as_len = 0; + start = chunk->start_offset; + + /* + * If a bit is set in the allocation map, the bound_map identifies + * where the allocation ends. If the allocation is not set, the + * bound_map does not identify free areas as it is only kept accurate + * on allocation, not free. + * + * Positive values are allocations and negative values are free + * fragments. 
+ */ + while (start < last_alloc) { + if (test_bit(start, chunk->alloc_map)) { + end = find_next_bit(chunk->bound_map, last_alloc, + start + 1); + alloc_sizes[as_len] = 1; + } else { + end = find_next_bit(chunk->alloc_map, last_alloc, + start + 1); + alloc_sizes[as_len] = -1; } - sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); + alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE; + + start = end; + } + + /* + * The negative values are free fragments and thus sorting gives the + * free fragments at the beginning in largest first order. + */ + if (as_len > 0) { + sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL); - /* Iterate through the unallocated fragements. */ + /* iterate through the unallocated fragments */ for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { sum_frag -= *p; max_frag = max(max_frag, -1 * (*p)); @@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); - P("free_size", chunk->free_size); - P("contig_hint", chunk->contig_hint); + P("empty_pop_pages", chunk->nr_empty_pop_pages); + P("first_bit", chunk->first_bit); + P("free_bytes", chunk->free_bytes); + P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); P("max_frag", max_frag); P("cur_min_alloc", cur_min_alloc); @@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, static int percpu_stats_show(struct seq_file *m, void *v) { struct pcpu_chunk *chunk; - int slot, max_map_used; - void *buffer; + int slot, max_nr_alloc; + int *buffer; alloc_buffer: spin_lock_irq(&pcpu_lock); - max_map_used = find_max_map_used(); + max_nr_alloc = find_max_nr_alloc(); spin_unlock_irq(&pcpu_lock); - buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); + /* there can be at most this many free and allocated fragments */ + buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int)); if 
(!buffer) return -ENOMEM; spin_lock_irq(&pcpu_lock); /* if the buffer allocated earlier is too small */ - if (max_map_used < find_max_map_used()) { + if (max_nr_alloc < find_max_nr_alloc()) { spin_unlock_irq(&pcpu_lock); vfree(buffer); goto alloc_buffer; } #define PL(X) \ - seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X) + seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) seq_printf(m, "Percpu Memory Statistics\n" @@ -151,7 +175,7 @@ alloc_buffer: #undef PL #define PU(X) \ - seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X) + seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X) seq_printf(m, "Global Stats:\n" @@ -164,6 +188,7 @@ alloc_buffer: PU(nr_max_chunks); PU(min_alloc_size); PU(max_alloc_size); + P("empty_pop_pages", pcpu_nr_empty_pop_pages); seq_putc(m, '\n'); #undef PU diff --git a/mm/percpu.c b/mm/percpu.c index bd4130a69bbc..59d44d61f5f1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -4,44 +4,53 @@ * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * - * This file is released under the GPLv2. + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com> * - * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks. Each chunk is - * consisted of boot-time determined number of units and the first - * chunk is used for static percpu variables in the kernel image - * (special boot time alloc/init handling necessary as these areas - * need to be brought up before allocation services are running). - * Unit grows as necessary and all units grow or shrink in unison. - * When a chunk is filled up, another chunk is allocated. + * This file is released under the GPLv2 license. + * + * The percpu allocator handles both static and dynamic areas. Percpu + * areas are allocated in chunks which are divided into units. 
There is + * a 1-to-1 mapping for units to possible cpus. These units are grouped + * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * - * Allocation is done in offset-size areas of single unit space. Ie, - * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to - * cpus. On NUMA, the mapping can be non-linear and even sparse. - * Percpu access can be done by configuring percpu base registers - * according to cpu to unit mapping and pcpu_unit_size. - * - * There are usually many small percpu allocations many of them being - * as small as 4 bytes. The allocator organizes chunks into lists - * according to free size and tries to allocate from the fullest one. - * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be equal to or larger than the maximum contiguous - * area in the chunk. This helps the allocator not to iterate the - * chunk maps unnecessarily. - * - * Allocation state in each chunk is kept using an array of integers - * on chunk->map. A positive value in the map represents a free - * region and negative allocated. Allocation inside a chunk is done - * by scanning this map sequentially and serving the first matching - * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks can be determined from the address using the index field - * in the page struct. The index field contains a pointer to the chunk. + * Allocation is done by offsets into a unit's address space. I.e., an + * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, + * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear + * and even sparse.
Access is handled by configuring percpu base + * registers according to the cpu to unit mappings and offsetting the + * base address using pcpu_unit_size. + * + * There is special consideration for the first chunk which must handle + * the static percpu variables in the kernel image as allocation services + * are not online yet. In short, the first chunk is structured like so: + * + * <Static | [Reserved] | Dynamic> + * + * The static data is copied from the original section managed by the + * linker. The reserved section, if non-zero, primarily manages static + * percpu variables from kernel modules. Finally, the dynamic section + * takes care of normal allocations. + * + * The allocator organizes chunks into lists according to free size and + * tries to allocate from the fullest chunk first. Each chunk is managed + * by a bitmap with metadata blocks. The allocation map is updated on + * every allocation and free to reflect the current state while the boundary + * map is only updated on allocation. Each metadata block contains + * information to help mitigate the need to iterate over large portions + * of the bitmap. The reverse mapping from page to chunk is stored in + * the page's index. Lastly, units are lazily backed and grow in unison. + * + * There is a unique conversion that goes on here between bytes and bits. + * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk + * tracks the number of pages it is responsible for in nr_pages. Helper + * functions are used to convert between bytes, bits, and blocks. + * All hints are managed in bits unless explicitly stated.
* * To use this allocator, arch code should do the following: * @@ -58,6 +67,7 @@ #include <linux/bitmap.h> #include <linux/bootmem.h> #include <linux/err.h> +#include <linux/lcm.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> @@ -81,10 +91,9 @@ #include "percpu-internal.h" -#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ -#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ -#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 -#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ +#define PCPU_SLOT_BASE_SHIFT 5 + #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 @@ -140,13 +149,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first - * chunk and serves it for reserved allocations. The amount of - * reserved offset is in pcpu_reserved_chunk_limit. When reserved - * area doesn't exist, the following variables contain NULL and 0 - * respectively. + * chunk and serves it for reserved allocations. When the reserved + * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; -static int pcpu_reserved_chunk_limit __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ @@ -160,7 +166,7 @@ static LIST_HEAD(pcpu_map_extend_chunks); * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. */ -static int pcpu_nr_empty_pop_pages; +int pcpu_nr_empty_pop_pages; /* * Balance work is used to populate or destroy chunks asynchronously. 
We @@ -179,19 +185,26 @@ static void pcpu_schedule_balance_work(void) schedule_work(&pcpu_balance_work); } -static bool pcpu_addr_in_first_chunk(void *addr) +/** + * pcpu_addr_in_chunk - check if the address is served from this chunk + * @chunk: chunk of interest + * @addr: percpu address + * + * RETURNS: + * True if the address is served from this chunk. + */ +static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { - void *first_start = pcpu_first_chunk->base_addr; + void *start_addr, *end_addr; - return addr >= first_start && addr < first_start + pcpu_unit_size; -} + if (!chunk) + return false; -static bool pcpu_addr_in_reserved_chunk(void *addr) -{ - void *first_start = pcpu_first_chunk->base_addr; + start_addr = chunk->base_addr + chunk->start_offset; + end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - + chunk->end_offset; - return addr >= first_start && - addr < first_start + pcpu_reserved_chunk_limit; + return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) @@ -209,10 +222,10 @@ static int pcpu_size_to_slot(int size) static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { - if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) return 0; - return pcpu_size_to_slot(chunk->free_size); + return pcpu_size_to_slot(chunk->free_bytes); } /* set the pointer to a chunk in a page struct */ @@ -232,42 +245,200 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } +static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) +{ + return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); +} + static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + - (page_idx << PAGE_SHIFT); + return (unsigned long)chunk->base_addr + + 
pcpu_unit_page_offset(cpu, page_idx); } -static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, - int *rs, int *re, int end) +static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) { - *rs = find_next_zero_bit(chunk->populated, end, *rs); - *re = find_next_bit(chunk->populated, end, *rs + 1); + *rs = find_next_zero_bit(bitmap, end, *rs); + *re = find_next_bit(bitmap, end, *rs + 1); } -static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, - int *rs, int *re, int end) +static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) { - *rs = find_next_bit(chunk->populated, end, *rs); - *re = find_next_zero_bit(chunk->populated, end, *rs + 1); + *rs = find_next_bit(bitmap, end, *rs); + *re = find_next_zero_bit(bitmap, end, *rs + 1); } /* - * (Un)populated page region iterators. Iterate over (un)populated - * page regions between @start and @end in @chunk. @rs and @re should - * be integer variables and will be set to start and end page index of - * the current region. + * Bitmap region iterators. Iterate over @bitmap between + * [@start, @end). @rs and @re should be integer variables + * and will be set to start and end index of the current region. + */ +#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) + +/* + * The following are helper functions to help access bitmaps and convert + * between bitmap offsets to address offsets.
+ */ +static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) +{ + return chunk->alloc_map + + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); +} + +static unsigned long pcpu_off_to_block_index(int off) +{ + return off / PCPU_BITMAP_BLOCK_BITS; +} + +static unsigned long pcpu_off_to_block_off(int off) +{ + return off & (PCPU_BITMAP_BLOCK_BITS - 1); +} + +static unsigned long pcpu_block_off_to_off(int index, int off) +{ + return index * PCPU_BITMAP_BLOCK_BITS + off; +} + +/** + * pcpu_next_md_free_region - finds the next hint free area + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * Helper function for pcpu_for_each_md_free_region. It checks + * block->contig_hint and performs aggregation across blocks to find the + * next hint. It modifies bit_off and bits in-place to be consumed in the + * loop. + */ +static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, + int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + return; + } + + /* + * This checks three things. First is there a contig_hint to + * check. Second, have we checked this hint before by + * comparing the block_off. Third, is this the same as the + * right contig hint. In the last case, it spills over into + * the next block and should be handled by the contig area + * across blocks code. 
+ */ + *bits = block->contig_hint; + if (*bits && block->contig_hint_start >= block_off && + *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { + *bit_off = pcpu_block_off_to_off(i, + block->contig_hint_start); + return; + } + + *bits = block->right_free; + *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; + } +} + +/** + * pcpu_next_fit_region - finds fit areas for a given allocation request + * @chunk: chunk of interest + * @alloc_bits: size of allocation + * @align: alignment of area (max PAGE_SIZE) + * @bit_off: chunk offset + * @bits: size of free area + * + * Finds the next free region that is viable for use with a given size and + * alignment. This only returns if there is a valid area to be used for this + * allocation. block->first_free is returned if the allocation request fits + * within the block to see if the request can be fulfilled prior to the contig + * hint. */ -#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) +static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, + int align, int *bit_off, int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (*bits >= alloc_bits) + return; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + } + + /* check block->contig_hint */ + *bits = ALIGN(block->contig_hint_start, align) - + block->contig_hint_start; + /* + * This uses the block offset to determine if this has been + * checked in the prior iteration. 
+ */ + if (block->contig_hint && + block->contig_hint_start >= block_off && + block->contig_hint >= *bits + alloc_bits) { + *bits += alloc_bits + block->contig_hint_start - + block->first_free; + *bit_off = pcpu_block_off_to_off(i, block->first_free); + return; + } + + *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, + align); + *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; + *bit_off = pcpu_block_off_to_off(i, *bit_off); + if (*bits >= alloc_bits) + return; + } -#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /* no valid offsets were found - fail condition */ + *bit_off = pcpu_chunk_map_bits(chunk); +} + +/* + * Metadata free area iterators. These perform aggregation of free areas + * based on the metadata blocks and return the offset @bit_off and size in + * bits of the free area @bits. pcpu_for_each_fit_region only returns when + * a fit is found for the allocation request. + */ +#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ + for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits) + 1, \ + pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) + +#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ + for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits), \ + pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits))) /** * pcpu_mem_zalloc - allocate memory @@ -306,38 +477,6 @@ static void pcpu_mem_free(void *ptr) } /** - * pcpu_count_occupied_pages - count the number of pages an area occupies - * @chunk: chunk of interest - * @i: index of the area in question - * - * Count the number of pages chunk's @i'th area occupies. 
When the area's - * start and/or end address isn't aligned to page boundary, the straddled - * page is included in the count iff the rest of the page is free. - */ -static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) -{ - int off = chunk->map[i] & ~1; - int end = chunk->map[i + 1] & ~1; - - if (!PAGE_ALIGNED(off) && i > 0) { - int prev = chunk->map[i - 1]; - - if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) - off = round_down(off, PAGE_SIZE); - } - - if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { - int next = chunk->map[i + 1]; - int nend = chunk->map[i + 2] & ~1; - - if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) - end = round_up(end, PAGE_SIZE); - } - - return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); -} - -/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -363,383 +502,706 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } /** - * pcpu_need_to_extend - determine whether chunk area map needs to be extended + * pcpu_cnt_pop_pages- counts populated backing pages in range * @chunk: chunk of interest - * @is_atomic: the allocation context + * @bit_off: start offset + * @bits: size of area to check * - * Determine whether area map of @chunk needs to be extended. If - * @is_atomic, only the amount necessary for a new allocation is - * considered; however, async extension is scheduled if the left amount is - * low. If !@is_atomic, it aims for more empty space. Combined, this - * ensures that the map is likely to have enough available space to - * accomodate atomic allocations which can't extend maps directly. - * - * CONTEXT: - * pcpu_lock. + * Calculates the number of populated pages in the region + * [page_start, page_end). This keeps track of how many empty populated + * pages are available and decide if async work should be scheduled. 
* * RETURNS: - * New target map allocation length if extension is necessary, 0 - * otherwise. + * The nr of populated pages. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) +static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, + int bits) { - int margin, new_alloc; - - lockdep_assert_held(&pcpu_lock); - - if (is_atomic) { - margin = 3; + int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); + int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { - if (list_empty(&chunk->map_extend_list)) { - list_add_tail(&chunk->map_extend_list, - &pcpu_map_extend_chunks); - pcpu_schedule_balance_work(); - } - } - } else { - margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; - } - - if (chunk->map_alloc >= chunk->map_used + margin) + if (page_start >= page_end) return 0; - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + margin) - new_alloc *= 2; - - return new_alloc; + /* + * bitmap_weight counts the number of bits set in a bitmap up to + * the specified number of bits. This is counting the populated + * pages up to page_end and then subtracting the populated pages + * up to page_start to count the populated pages in + * [page_start, page_end). + */ + return bitmap_weight(chunk->populated, page_end) - + bitmap_weight(chunk->populated, page_start); } /** - * pcpu_extend_area_map - extend area map of a chunk + * pcpu_chunk_update - updates the chunk metadata given a free area * @chunk: chunk of interest - * @new_alloc: new target allocation length of the area map + * @bit_off: chunk offset + * @bits: size of free area * - * Extend area map of @chunk to have @new_alloc entries. + * This updates the chunk's contig hint and starting offset given a free area. + * Choose the best starting offset if the contig hint is equal. 
+ */ +static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) +{ + if (bits > chunk->contig_bits) { + chunk->contig_bits_start = bit_off; + chunk->contig_bits = bits; + } else if (bits == chunk->contig_bits && chunk->contig_bits_start && + (!bit_off || + __ffs(bit_off) > __ffs(chunk->contig_bits_start))) { + /* use the start with the best alignment */ + chunk->contig_bits_start = bit_off; + } +} + +/** + * pcpu_chunk_refresh_hint - updates metadata about a chunk + * @chunk: chunk of interest * - * CONTEXT: - * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. + * Iterates over the metadata blocks to find the largest contig area. + * It also counts the populated pages and uses the delta to update the + * global count. * - * RETURNS: - * 0 on success, -errno on failure. + * Updates: + * chunk->contig_bits + * chunk->contig_bits_start + * nr_empty_pop_pages (chunk and global) */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) { - int *old = NULL, *new = NULL; - size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); - unsigned long flags; + int bit_off, bits, nr_empty_pop_pages; - lockdep_assert_held(&pcpu_alloc_mutex); + /* clear metadata */ + chunk->contig_bits = 0; - new = pcpu_mem_zalloc(new_size); - if (!new) - return -ENOMEM; + bit_off = chunk->first_bit; + bits = nr_empty_pop_pages = 0; + pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_chunk_update(chunk, bit_off, bits); - /* acquire pcpu_lock and switch to new area map */ - spin_lock_irqsave(&pcpu_lock, flags); + nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits); + } - if (new_alloc <= chunk->map_alloc) - goto out_unlock; + /* + * Keep track of nr_empty_pop_pages. + * + * The chunk maintains the previous number of free pages it held, + * so the delta is used to update the global counter. 
The reserved + * chunk is not part of the free page count as they are populated + * at init and are special to serving reserved allocations. + */ + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += + (nr_empty_pop_pages - chunk->nr_empty_pop_pages); - old_size = chunk->map_alloc * sizeof(chunk->map[0]); - old = chunk->map; + chunk->nr_empty_pop_pages = nr_empty_pop_pages; +} - memcpy(new, old, old_size); +/** + * pcpu_block_update - updates a block given a free area + * @block: block of interest + * @start: start offset in block + * @end: end offset in block + * + * Updates a block given a known free area. The region [start, end) is + * expected to be the entirety of the free area within a block. Chooses + * the best starting offset if the contig hints are equal. + */ +static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) +{ + int contig = end - start; + + block->first_free = min(block->first_free, start); + if (start == 0) + block->left_free = contig; + + if (end == PCPU_BITMAP_BLOCK_BITS) + block->right_free = contig; + + if (contig > block->contig_hint) { + block->contig_hint_start = start; + block->contig_hint = contig; + } else if (block->contig_hint_start && contig == block->contig_hint && + (!start || __ffs(start) > __ffs(block->contig_hint_start))) { + /* use the start with the best alignment */ + block->contig_hint_start = start; + } +} - chunk->map_alloc = new_alloc; - chunk->map = new; - new = NULL; +/** + * pcpu_block_refresh_hint + * @chunk: chunk of interest + * @index: index of the metadata block + * + * Scans over the block beginning at first_free and updates the block + * metadata accordingly. 
+ */ +static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) +{ + struct pcpu_block_md *block = chunk->md_blocks + index; + unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); + int rs, re; /* region start, region end */ + + /* clear hints */ + block->contig_hint = 0; + block->left_free = block->right_free = 0; + + /* iterate over free areas and update the contig hints */ + pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, + PCPU_BITMAP_BLOCK_BITS) { + pcpu_block_update(block, rs, re); + } +} -out_unlock: - spin_unlock_irqrestore(&pcpu_lock, flags); +/** + * pcpu_block_update_hint_alloc - update hint on allocation path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + * + * Updates metadata for the allocation path. The metadata only has to be + * refreshed by a full scan iff the chunk's contig hint is broken. Block level + * scans are required if the block's contig hint is broken. + */ +static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ /* - * pcpu_mem_free() might end up calling vfree() which uses - * IRQ-unsafe lock and thus can't be called under pcpu_lock. + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. */ - pcpu_mem_free(old); - pcpu_mem_free(new); + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; - return 0; + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + + /* + * Update s_block. 
+ * block->first_free must be updated if the allocation takes its place. + * If the allocation breaks the contig_hint, a scan is required to + * restore this hint. + */ + if (s_off == s_block->first_free) + s_block->first_free = find_next_zero_bit( + pcpu_index_alloc_map(chunk, s_index), + PCPU_BITMAP_BLOCK_BITS, + s_off + bits); + + if (s_off >= s_block->contig_hint_start && + s_off < s_block->contig_hint_start + s_block->contig_hint) { + /* block contig hint is broken - scan to fix it */ + pcpu_block_refresh_hint(chunk, s_index); + } else { + /* update left and right contig manually */ + s_block->left_free = min(s_block->left_free, s_off); + if (s_index == e_index) + s_block->right_free = min_t(int, s_block->right_free, + PCPU_BITMAP_BLOCK_BITS - e_off); + else + s_block->right_free = 0; + } + + /* + * Update e_block. + */ + if (s_index != e_index) { + /* + * When the allocation is across blocks, the end is along + * the left part of the e_block. + */ + e_block->first_free = find_next_zero_bit( + pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, e_off); + + if (e_off == PCPU_BITMAP_BLOCK_BITS) { + /* reset the block */ + e_block++; + } else { + if (e_off > e_block->contig_hint_start) { + /* contig hint is broken - scan to fix it */ + pcpu_block_refresh_hint(chunk, e_index); + } else { + e_block->left_free = 0; + e_block->right_free = + min_t(int, e_block->right_free, + PCPU_BITMAP_BLOCK_BITS - e_off); + } + } + + /* update in-between md_blocks */ + for (block = s_block + 1; block < e_block; block++) { + block->contig_hint = 0; + block->left_free = 0; + block->right_free = 0; + } + } + + /* + * The only time a full chunk scan is required is if the chunk + * contig hint is broken. Otherwise, it means a smaller space + * was used and therefore the chunk contig hint is still correct. 
+ */ + if (bit_off >= chunk->contig_bits_start && + bit_off < chunk->contig_bits_start + chunk->contig_bits) + pcpu_chunk_refresh_hint(chunk); } /** - * pcpu_fit_in_area - try to fit the requested allocation in a candidate area - * @chunk: chunk the candidate area belongs to - * @off: the offset to the start of the candidate area - * @this_size: the size of the candidate area - * @size: the size of the target allocation - * @align: the alignment of the target allocation - * @pop_only: only allocate from already populated region - * - * We're trying to allocate @size bytes aligned at @align. @chunk's area - * at @off sized @this_size is a candidate. This function determines - * whether the target allocation fits in the candidate area and returns the - * number of bytes to pad after @off. If the target area doesn't fit, -1 - * is returned. - * - * If @pop_only is %true, this function only considers the already - * populated part of the candidate area. + * pcpu_block_update_hint_free - updates the block hints on the free path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + * + * Updates metadata for the allocation path. This avoids a blind block + * refresh by making use of the block contig hints. If this fails, it scans + * forward and backward to determine the extent of the free area. This is + * capped at the boundary of blocks. + * + * A chunk update is triggered if a page becomes free, a block becomes free, + * or the free spans across blocks. This tradeoff is to minimize iterating + * over the block metadata to update chunk->contig_bits. chunk->contig_bits + * may be off by up to a page, but it will never be more than the available + * space. If the contig hint is contained in one block, it will be accurate. 
*/ -static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, - int size, int align, bool pop_only) +static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, + int bits) { - int cand_off = off; - - while (true) { - int head = ALIGN(cand_off, align) - off; - int page_start, page_end, rs, re; + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ + int start, end; /* start and end of the whole free area */ - if (this_size < head + size) - return -1; + /* + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. + */ + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; - if (!pop_only) - return head; + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + /* + * Check if the freed area aligns with the block->contig_hint. + * If it does, then the scan to find the beginning/end of the + * larger free area can be avoided. + * + * start and end refer to beginning and end of the free area + * within each their respective blocks. This is not necessarily + * the entire free area as it may span blocks past the beginning + * or end of the block. + */ + start = s_off; + if (s_off == s_block->contig_hint + s_block->contig_hint_start) { + start = s_block->contig_hint_start; + } else { /* - * If the first unpopulated page is beyond the end of the - * allocation, the whole allocation is populated; - * otherwise, retry from the end of the unpopulated area. + * Scan backwards to find the extent of the free area. 
+ * find_last_bit returns the starting bit, so if the start bit + * is returned, that means there was no last bit and the + * remainder of the chunk is free. */ - page_start = PFN_DOWN(head + off); - page_end = PFN_UP(head + off + size); - - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); - if (rs >= page_end) - return head; - cand_off = re * PAGE_SIZE; + int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), + start); + start = (start == l_bit) ? 0 : l_bit + 1; + } + + end = e_off; + if (e_off == e_block->contig_hint_start) + end = e_block->contig_hint_start + e_block->contig_hint; + else + end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, end); + + /* update s_block */ + e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; + pcpu_block_update(s_block, start, e_off); + + /* freeing in the same block */ + if (s_index != e_index) { + /* update e_block */ + pcpu_block_update(e_block, 0, end); + + /* reset md_blocks in the middle */ + for (block = s_block + 1; block < e_block; block++) { + block->first_free = 0; + block->contig_hint_start = 0; + block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + block->left_free = PCPU_BITMAP_BLOCK_BITS; + block->right_free = PCPU_BITMAP_BLOCK_BITS; + } } + + /* + * Refresh chunk metadata when the free makes a page free, a block + * free, or spans across blocks. The contig hint may be off by up to + * a page, but if the hint is contained in a block, it will be accurate + * with the else condition below. 
+ */ + if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > + ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || + s_index != e_index) + pcpu_chunk_refresh_hint(chunk); + else + pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), + s_block->contig_hint); } /** - * pcpu_alloc_area - allocate area from a pcpu_chunk + * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest - * @size: wanted size in bytes - * @align: wanted align - * @pop_only: allocate only from the populated area - * @occ_pages_p: out param for the number of pages the area occupies - * - * Try to allocate @size bytes area aligned at @align from @chunk. - * Note that this function only allocates the offset. It doesn't - * populate or map the area. + * @bit_off: chunk offset + * @bits: size of area + * @next_off: return value for the next offset to start searching * - * @chunk->map must have at least two free slots. + * For atomic allocations, check if the backing pages are populated. * - * CONTEXT: - * pcpu_lock. + * RETURNS: + * Bool if the backing pages are populated. + * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. 
+ */ +static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, + int *next_off) +{ + int page_start, page_end, rs, re; + + page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + + rs = page_start; + pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + if (rs >= page_end) + return true; + + *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + return false; +} + +/** + * pcpu_find_block_fit - finds the block index to start searching + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE bytes) + * @pop_only: use populated regions only + * + * Given a chunk and an allocation spec, find the offset to begin searching + * for a free region. This iterates over the bitmap metadata blocks to + * find an offset that will be guaranteed to fit the requirements. It is + * not quite first fit as if the allocation does not fit in the contig hint + * of a block or chunk, it is skipped. This errs on the side of caution + * to prevent excess iteration. Poor alignment can cause the allocator to + * skip over blocks and chunks that have valid free areas. * * RETURNS: - * Allocated offset in @chunk on success, -1 if no matching area is - * found. + * The offset in the bitmap to begin searching. + * -1 if no offset is found. 
*/ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only, int *occ_pages_p) +static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, bool pop_only) { - int oslot = pcpu_chunk_slot(chunk); - int max_contig = 0; - int i, off; - bool seen_free = false; - int *p; - - for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { - int head, tail; - int this_size; - - off = *p; - if (off & 1) - continue; + int bit_off, bits, next_off; - this_size = (p[1] & ~1) - off; + /* + * Check to see if the allocation can fit in the chunk's contig hint. + * This is an optimization to prevent scanning by assuming if it + * cannot fit in the global hint, there is memory pressure and creating + * a new chunk would happen soon. + */ + bit_off = ALIGN(chunk->contig_bits_start, align) - + chunk->contig_bits_start; + if (bit_off + alloc_bits > chunk->contig_bits) + return -1; + + bit_off = chunk->first_bit; + bits = 0; + pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { + if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, + &next_off)) + break; - head = pcpu_fit_in_area(chunk, off, this_size, size, align, - pop_only); - if (head < 0) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - max_contig = max(this_size, max_contig); - continue; - } + bit_off = next_off; + bits = 0; + } - /* - * If head is small or the previous block is free, - * merge'em. Note that 'small' is defined as smaller - * than sizeof(int), which is very small but isn't too - * uncommon for percpu allocations. 
- */ - if (head && (head < sizeof(int) || !(p[-1] & 1))) { - *p = off += head; - if (p[-1] & 1) - chunk->free_size -= head; - else - max_contig = max(*p - p[-1], max_contig); - this_size -= head; - head = 0; - } + if (bit_off == pcpu_chunk_map_bits(chunk)) + return -1; - /* if tail is small, just keep it around */ - tail = this_size - head - size; - if (tail < sizeof(int)) { - tail = 0; - size = this_size - head; - } + return bit_off; +} - /* split if warranted */ - if (head || tail) { - int nr_extra = !!head + !!tail; - - /* insert new subblocks */ - memmove(p + nr_extra + 1, p + 1, - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - *++p = off += head; - ++i; - max_contig = max(head, max_contig); - } - if (tail) { - p[1] = off + size; - max_contig = max(tail, max_contig); - } - } +/** + * pcpu_alloc_area - allocates an area from a pcpu_chunk + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE) + * @start: bit_off to start searching + * + * This function takes in a @start offset to begin searching to fit an + * allocation of @alloc_bits with alignment @align. It needs to scan + * the allocation map because if it fits within the block's contig hint, + * @start will be block->first_free. This is an attempt to fill the + * allocation prior to breaking the contig hint. The allocation and + * boundary maps are updated accordingly if it confirms a valid + * free area. + * + * RETURNS: + * Allocated addr offset in @chunk on success. + * -1 if no matching area is found. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, int start) +{ + size_t align_mask = (align) ? 
(align - 1) : 0; + int bit_off, end, oslot; - if (!seen_free) - chunk->first_free = i + 1; + lockdep_assert_held(&pcpu_lock); - /* update hint and mark allocated */ - if (i + 1 == chunk->map_used) - chunk->contig_hint = max_contig; /* fully scanned */ - else - chunk->contig_hint = max(chunk->contig_hint, - max_contig); + oslot = pcpu_chunk_slot(chunk); - chunk->free_size -= size; - *p |= 1; + /* + * Search to find a fit. + */ + end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; + bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, + alloc_bits, align_mask); + if (bit_off >= end) + return -1; - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); - pcpu_chunk_relocate(chunk, oslot); - return off; - } + /* update alloc map */ + bitmap_set(chunk->alloc_map, bit_off, alloc_bits); + + /* update boundary map */ + set_bit(bit_off, chunk->bound_map); + bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); + set_bit(bit_off + alloc_bits, chunk->bound_map); + + chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; + + /* update first free bit */ + if (bit_off == chunk->first_bit) + chunk->first_bit = find_next_zero_bit( + chunk->alloc_map, + pcpu_chunk_map_bits(chunk), + bit_off + alloc_bits); + + pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); - chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* tell the upper layer that this chunk has no matching area */ - return -1; + return bit_off * PCPU_MIN_ALLOC_SIZE; } /** - * pcpu_free_area - free area to a pcpu_chunk + * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest - * @freeme: offset of area to free - * @occ_pages_p: out param for the number of pages the area occupies - * - * Free area starting from @freeme to @chunk. Note that this function - * only modifies the allocation map. It doesn't depopulate or unmap - * the area. + * @off: addr offset into chunk * - * CONTEXT: - * pcpu_lock. 
+ * This function determines the size of an allocation to free using + * the boundary bitmap and clears the allocation map. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, - int *occ_pages_p) +static void pcpu_free_area(struct pcpu_chunk *chunk, int off) { - int oslot = pcpu_chunk_slot(chunk); - int off = 0; - unsigned i, j; - int to_free = 0; - int *p; + int bit_off, bits, end, oslot; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); - freeme |= 1; /* we are searching for <given offset, in use> pair */ - - i = 0; - j = chunk->map_used; - while (i != j) { - unsigned k = (i + j) / 2; - off = chunk->map[k]; - if (off < freeme) - i = k + 1; - else if (off > freeme) - j = k; - else - i = j = k; + oslot = pcpu_chunk_slot(chunk); + + bit_off = off / PCPU_MIN_ALLOC_SIZE; + + /* find end index */ + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), + bit_off + 1); + bits = end - bit_off; + bitmap_clear(chunk->alloc_map, bit_off, bits); + + /* update metadata */ + chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; + + /* update first free bit */ + chunk->first_bit = min(chunk->first_bit, bit_off); + + pcpu_block_update_hint_free(chunk, bit_off, bits); + + pcpu_chunk_relocate(chunk, oslot); +} + +static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) +{ + struct pcpu_block_md *md_block; + + for (md_block = chunk->md_blocks; + md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); + md_block++) { + md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + md_block->left_free = PCPU_BITMAP_BLOCK_BITS; + md_block->right_free = PCPU_BITMAP_BLOCK_BITS; } - BUG_ON(off != freeme); +} - if (i < chunk->first_free) - chunk->first_free = i; +/** + * pcpu_alloc_first_chunk - creates chunks that serve the first chunk + * @tmp_addr: the start of the region served + * @map_size: size of the region served + * + * This is responsible for creating the chunks that serve the first chunk. 
The + * base_addr is page aligned down of @tmp_addr while the region end is page + * aligned up. Offsets are kept track of to determine the region served. All + * this is done to appease the bitmap allocator in avoiding partial blocks. + * + * RETURNS: + * Chunk serving the region at @tmp_addr of @map_size. + */ +static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, + int map_size) +{ + struct pcpu_chunk *chunk; + unsigned long aligned_addr, lcm_align; + int start_offset, offset_bits, region_size, region_bits; - p = chunk->map + i; - *p = off &= ~1; - chunk->free_size += (p[1] & ~1) - off; + /* region calculations */ + aligned_addr = tmp_addr & PAGE_MASK; - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + start_offset = tmp_addr - aligned_addr; - /* merge with next? */ - if (!(p[1] & 1)) - to_free++; - /* merge with previous? */ - if (i > 0 && !(p[-1] & 1)) { - to_free++; - i--; - p--; + /* + * Align the end of the region with the LCM of PAGE_SIZE and + * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of + * the other. 
+ */ + lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); + region_size = ALIGN(start_offset + map_size, lcm_align); + + /* allocate chunk */ + chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + 0); + + INIT_LIST_HEAD(&chunk->list); + + chunk->base_addr = (void *)aligned_addr; + chunk->start_offset = start_offset; + chunk->end_offset = region_size - chunk->start_offset - map_size; + + chunk->nr_pages = region_size >> PAGE_SHIFT; + region_bits = pcpu_chunk_map_bits(chunk); + + chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0]), 0); + chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0]), 0); + chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0]), 0); + pcpu_init_md_blocks(chunk); + + /* manage populated page bitmap */ + chunk->immutable = true; + bitmap_fill(chunk->populated, chunk->nr_pages); + chunk->nr_populated = chunk->nr_pages; + chunk->nr_empty_pop_pages = + pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, + map_size / PCPU_MIN_ALLOC_SIZE); + + chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; + chunk->free_bytes = map_size; + + if (chunk->start_offset) { + /* hide the beginning of the bitmap */ + offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, 0, offset_bits); + set_bit(0, chunk->bound_map); + set_bit(offset_bits, chunk->bound_map); + + chunk->first_bit = offset_bits; + + pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } - if (to_free) { - chunk->map_used -= to_free; - memmove(p + 1, p + 1 + to_free, - (chunk->map_used - i) * sizeof(chunk->map[0])); + + if (chunk->end_offset) { + /* hide the end of the bitmap */ + offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - offset_bits, + offset_bits); + set_bit((start_offset + map_size) / 
PCPU_MIN_ALLOC_SIZE, + chunk->bound_map); + set_bit(region_bits, chunk->bound_map); + + pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) + - offset_bits, offset_bits); } - chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); - pcpu_chunk_relocate(chunk, oslot); + return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; + int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * - sizeof(chunk->map[0])); - if (!chunk->map) { - pcpu_mem_free(chunk); - return NULL; - } + INIT_LIST_HEAD(&chunk->list); + chunk->nr_pages = pcpu_unit_pages; + region_bits = pcpu_chunk_map_bits(chunk); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[0] = 0; - chunk->map[1] = pcpu_unit_size | 1; - chunk->map_used = 1; - chunk->has_reserved = false; + chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0])); + if (!chunk->alloc_map) + goto alloc_map_fail; - INIT_LIST_HEAD(&chunk->list); - INIT_LIST_HEAD(&chunk->map_extend_list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; + chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0])); + if (!chunk->bound_map) + goto bound_map_fail; + + chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0])); + if (!chunk->md_blocks) + goto md_blocks_fail; + + pcpu_init_md_blocks(chunk); + + /* init metadata */ + chunk->contig_bits = region_bits; + chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; + +md_blocks_fail: + pcpu_mem_free(chunk->bound_map); +bound_map_fail: + pcpu_mem_free(chunk->alloc_map); +alloc_map_fail: + pcpu_mem_free(chunk); + + return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk->bound_map); + pcpu_mem_free(chunk->alloc_map); 
pcpu_mem_free(chunk); } @@ -748,13 +1210,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page + * @for_alloc: if this is to populate for allocation * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. + * + * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it + * is to serve an allocation in that area. */ -static void pcpu_chunk_populated(struct pcpu_chunk *chunk, - int page_start, int page_end) +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, + int page_end, bool for_alloc) { int nr = page_end - page_start; @@ -762,7 +1228,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; - pcpu_nr_empty_pop_pages += nr; + + if (!for_alloc) { + chunk->nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages += nr; + } } /** @@ -784,6 +1254,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; + chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; } @@ -819,18 +1290,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * + * This is an internal function that handles all but static allocations. + * Static percpu address values should never be passed into the allocator. + * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - /* is it in the first chunk? */ - if (pcpu_addr_in_first_chunk(addr)) { - /* is it in the reserved area? 
*/ - if (pcpu_addr_in_reserved_chunk(addr)) - return pcpu_reserved_chunk; + /* is it in the dynamic region (first chunk)? */ + if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; - } + + /* is it in the reserved region? */ + if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) + return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and @@ -863,19 +1337,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - int occ_pages = 0; - int slot, off, new_alloc, cpu, ret; + int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; + size_t bits, bit_align; /* - * We want the lowest bit of offset available for in-use/free - * indicator, so force >= 16bit alignment and make size even. + * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, + * therefore alignment must be a minimum of that many bytes. + * An allocation may have internal fragmentation from rounding up + * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. 
*/ - if (unlikely(align < 2)) - align = 2; + if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) + align = PCPU_MIN_ALLOC_SIZE; - size = ALIGN(size, 2); + size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); + bits = size >> PCPU_MIN_ALLOC_SHIFT; + bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { @@ -893,23 +1371,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) { + off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); + if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { - spin_unlock_irqrestore(&pcpu_lock, flags); - if (is_atomic || - pcpu_extend_area_map(chunk, new_alloc) < 0) { - err = "failed to extend area map of reserved chunk"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; @@ -921,31 +1389,15 @@ restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (size > chunk->contig_hint) + off = pcpu_find_block_fit(chunk, bits, bit_align, + is_atomic); + if (off < 0) continue; - new_alloc = pcpu_need_to_extend(chunk, is_atomic); - if (new_alloc) { - if (is_atomic) - continue; - spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, - new_alloc) < 0) { - err = "failed to extend area map"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - /* - * pcpu_lock has been dropped, need to - * restart cpu_slot list walking. 
- */ - goto restart; - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; + } } @@ -987,30 +1439,25 @@ area_found: page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + pcpu_for_each_unpop_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - pcpu_free_area(chunk, off, &occ_pages); + pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - pcpu_chunk_populated(chunk, rs, re); + pcpu_chunk_populated(chunk, rs, re, true); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_nr_empty_pop_pages -= occ_pages; - spin_unlock_irqrestore(&pcpu_lock, flags); - } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); @@ -1128,7 +1575,6 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1137,7 +1583,8 @@ static void pcpu_balance_workfn(struct work_struct *work) list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; - pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_for_each_pop_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1146,25 +1593,6 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } - /* service chunks which requested async area map extension */ - do { - int new_alloc = 0; - - spin_lock_irq(&pcpu_lock); - - chunk = 
list_first_entry_or_null(&pcpu_map_extend_chunks, - struct pcpu_chunk, map_extend_list); - if (chunk) { - list_del_init(&chunk->map_extend_list); - new_alloc = pcpu_need_to_extend(chunk, false); - } - - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); - } while (chunk); - /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic @@ -1194,7 +1622,7 @@ retry_pop: spin_lock_irq(&pcpu_lock); list_for_each_entry(chunk, &pcpu_slot[slot], list) { - nr_unpop = pcpu_unit_pages - chunk->nr_populated; + nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } @@ -1204,14 +1632,15 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, rs, rs + nr); + pcpu_chunk_populated(chunk, rs, rs + nr, false); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; @@ -1250,7 +1679,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off, occ_pages; + int off; if (!ptr) return; @@ -1264,13 +1693,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off, &occ_pages); - - if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += occ_pages; + pcpu_free_area(chunk, off); /* if there are more than one fully free chunks, wake up grim reaper */ - if (chunk->free_size == pcpu_unit_size) { + if (chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) @@ -1361,10 +1787,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) * The following test 
on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. + * + * The address check is against full chunk sizes. pcpu_base_addr + * points to the beginning of the first chunk including the + * static region. Assumes good intent as the first chunk may + * not be full (ie. < pcpu_unit_pages in size). */ - first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); - first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, - pcpu_unit_pages); + first_low = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); + first_high = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { @@ -1546,12 +1978,13 @@ static void pcpu_dump_alloc_info(const char *lvl, * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * - * If the first chunk ends up with both reserved and dynamic areas, it - * is served by two chunks - one to serve the core static and reserved - * areas and the other for the dynamic area. They share the same vm - * and page map but uses different area allocation map to stay away - * from each other. The latter chunk is circulated in the chunk slots - * and available for dynamic allocation like any other chunks. + * The first chunk will always contain a static and a dynamic region. + * However, the static region is not managed by any chunk. If the first + * chunk also contains a reserved region, it is served by two chunks - + * one for the reserved region and one for the dynamic region. They + * share the same vm, but use offset regions in the area allocation map. + * The chunk serving the dynamic region is circulated in the chunk slots + * and available for dynamic allocation like any other chunk. * * RETURNS: * 0 on success, -errno on failure. 
@@ -1559,17 +1992,17 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { - static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - size_t dyn_size = ai->dyn_size; - size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; - struct pcpu_chunk *schunk, *dchunk = NULL; + size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + size_t static_size, dyn_size; + struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; + int map_size; + unsigned long tmp_addr; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ @@ -1592,7 +2025,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); + PCPU_SETUP_BUG_ON(!ai->dyn_size); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); + PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || + IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ @@ -1671,64 +2109,41 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, INIT_LIST_HEAD(&pcpu_slot[i]); /* - * Initialize static chunk. If reserved_size is zero, the - * static chunk covers static area + dynamic allocation area - * in the first chunk. If reserved_size is not zero, it - * covers static area + reserved area (mostly used for module - * static percpu allocation). 
+ * The end of the static region needs to be aligned with the + * minimum allocation size as this offsets the reserved and + * dynamic region. The first chunk ends page aligned by + * expanding the dynamic region, therefore the dynamic region + * can be shrunk to compensate while still staying above the + * configured sizes. */ - schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); - INIT_LIST_HEAD(&schunk->list); - INIT_LIST_HEAD(&schunk->map_extend_list); - schunk->base_addr = base_addr; - schunk->map = smap; - schunk->map_alloc = ARRAY_SIZE(smap); - schunk->immutable = true; - bitmap_fill(schunk->populated, pcpu_unit_pages); - schunk->nr_populated = pcpu_unit_pages; + static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); + dyn_size = ai->dyn_size - (static_size - ai->static_size); - if (ai->reserved_size) { - schunk->free_size = ai->reserved_size; - pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; - } else { - schunk->free_size = dyn_size; - dyn_size = 0; /* dynamic area covered */ - } - schunk->contig_hint = schunk->free_size; - - schunk->map[0] = 1; - schunk->map[1] = ai->static_size; - schunk->map_used = 1; - if (schunk->free_size) - schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; - schunk->map[schunk->map_used] |= 1; - schunk->has_reserved = true; + /* + * Initialize first chunk. + * If the reserved_size is non-zero, this initializes the reserved + * chunk. If the reserved_size is zero, the reserved chunk is NULL + * and the dynamic region is initialized here. The first chunk, + * pcpu_first_chunk, will always point to the chunk that serves + * the dynamic region. 
+ */ + tmp_addr = (unsigned long)base_addr + static_size; + map_size = ai->reserved_size ?: dyn_size; + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); /* init dynamic chunk if necessary */ - if (dyn_size) { - dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); - INIT_LIST_HEAD(&dchunk->list); - INIT_LIST_HEAD(&dchunk->map_extend_list); - dchunk->base_addr = base_addr; - dchunk->map = dmap; - dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->immutable = true; - bitmap_fill(dchunk->populated, pcpu_unit_pages); - dchunk->nr_populated = pcpu_unit_pages; - - dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[0] = 1; - dchunk->map[1] = pcpu_reserved_chunk_limit; - dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; - dchunk->map_used = 2; - dchunk->has_reserved = true; + if (ai->reserved_size) { + pcpu_reserved_chunk = chunk; + + tmp_addr = (unsigned long)base_addr + static_size + + ai->reserved_size; + map_size = dyn_size; + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); } /* link the first chunk in */ - pcpu_first_chunk = dchunk ?: schunk; - pcpu_nr_empty_pop_pages += - pcpu_count_occupied_pages(pcpu_first_chunk, 1); + pcpu_first_chunk = chunk; + pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); pcpu_stats_chunk_alloc(); @@ -1842,6 +2257,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) @@ -1868,9 +2284,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( } /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. + * Wasted space is caused by a ratio imbalance of upa to group_cnt. 
+ * Expand the unit_size until we use >= 75% of the units allocated. + * Related to atom_size, which could be much larger than the unit_size. */ last_allocs = INT_MAX; for (upa = max_upa; upa; upa--) { @@ -2299,36 +2715,6 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ /* - * First and reserved chunks are initialized with temporary allocation - * map in initdata so that they can be used before slab is online. - * This function is called after slab is brought up and replaces those - * with properly allocated maps. - */ -void __init percpu_init_late(void) -{ - struct pcpu_chunk *target_chunks[] = - { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; - struct pcpu_chunk *chunk; - unsigned long flags; - int i; - - for (i = 0; (chunk = target_chunks[i]); i++) { - int *map; - const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); - - BUILD_BUG_ON(size > PAGE_SIZE); - - map = pcpu_mem_zalloc(size); - BUG_ON(!map); - - spin_lock_irqsave(&pcpu_lock, flags); - memcpy(map, chunk->map, size); - chunk->map = map; - spin_unlock_irqrestore(&pcpu_lock, flags); - } -} - -/* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. diff --git a/mm/rmap.c b/mm/rmap.c index ced14f1af6dc..c570f82e6827 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -605,6 +605,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) tlb_ubc->flush_required = true; /* + * Ensure compiler does not re-order the setting of tlb_flush_batched + * before the PTE is cleared. + */ + barrier(); + mm->tlb_flush_batched = true; + + /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. 
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } + +/* + * Reclaim unmaps pages under the PTL but do not flush the TLB prior to + * releasing the PTL if TLB flushes are batched. It's possible for a parallel + * operation such as mprotect or munmap to race between reclaim unmapping + * the page and flushing the page. If this race occurs, it potentially allows + * access to data via a stale TLB entry. Tracking all mm's that have TLB + * batching in flight would be expensive during reclaim so instead track + * whether TLB batching occurred in the past and if so then do a flush here + * if required. This will cost one additional flush per reclaim cycle paid + * by the first operation at risk such as mprotect and mumap. + * + * This must be called under the PTL so that an access to tlb_flush_batched + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise + * via the PTL. + */ +void flush_tlb_batched_pending(struct mm_struct *mm) +{ + if (mm->tlb_flush_batched) { + flush_tlb_mm(mm); + + /* + * Do not allow the compiler to re-order the clearing of + * tlb_flush_batched before the tlb is flushed. + */ + barrier(); + mm->tlb_flush_batched = false; + } +} #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { @@ -851,11 +887,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, .address = address, .flags = PVMW_SYNC, }; + unsigned long start = address, end; int *cleaned = arg; + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free from this function. 
+ */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { + unsigned long cstart, cend; int ret = 0; - address = pvmw.address; + + cstart = address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -868,6 +914,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); + cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -882,6 +929,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); + cstart &= PMD_MASK; + cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -890,11 +939,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, } if (ret) { - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); (*cleaned)++; } } + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return true; } @@ -1288,6 +1339,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; bool ret = true; + unsigned long start = address, end; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ @@ -1299,6 +1351,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, flags & TTU_MIGRATION, page); } + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free in this function as call of try_to_unmap() + * must hold a reference on the page. 
+ */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { /* * If the page is mlock()d, we cannot swap it out. @@ -1409,6 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); ret = false; + /* We have to invalidate as we cleared the pte */ page_vma_mapped_walk_done(&pvmw); break; } @@ -1454,8 +1515,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, discard: page_remove_rmap(subpage, PageHuge(page)); put_page(page); - mmu_notifier_invalidate_page(mm, address); + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } + + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index b0aa6075d164..ace53a582be5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,6 +34,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/khugepaged.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ @@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } +static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (shmem_acct_block(info->flags, pages)) + return false; + + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } + + return true; + +unacct: + shmem_unacct_blocks(info->flags, pages); + return false; +} + +static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct 
shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); +} + static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; @@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -freed); info->alloced -= freed; inode->i_blocks -= freed * BLOCKS_PER_PAGE; - shmem_unacct_blocks(info->flags, freed); + shmem_inode_unacct_blocks(inode, freed); } } bool shmem_charge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long flags; - if (shmem_acct_block(info->flags, pages)) + if (!shmem_inode_acct_block(inode, pages)) return false; + spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; @@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages) spin_unlock_irqrestore(&info->lock, flags); inode->i_mapping->nrpages += pages; - if (!sbinfo->max_blocks) - return true; - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) { - inode->i_mapping->nrpages -= pages; - spin_lock_irqsave(&info->lock, flags); - info->alloced -= pages; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); - shmem_unacct_blocks(info->flags, pages); - return false; - } - percpu_counter_add(&sbinfo->used_blocks, pages); return true; } void shmem_uncharge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long flags; spin_lock_irqsave(&info->lock, flags); @@ -301,9 
+317,7 @@ void shmem_uncharge(struct inode *inode, long pages) shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - if (sbinfo->max_blocks) - percpu_counter_sub(&sbinfo->used_blocks, pages); - shmem_unacct_blocks(info->flags, pages); + shmem_inode_unacct_blocks(inode, pages); } /* @@ -1022,7 +1036,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { spin_lock(&sbinfo->shrinklist_lock); - if (list_empty(&info->shrinklist)) { + /* + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() + */ + if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; @@ -1448,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp, } static struct page *shmem_alloc_and_acct_page(gfp_t gfp, - struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, + struct inode *inode, pgoff_t index, bool huge) { + struct shmem_inode_info *info = SHMEM_I(inode); struct page *page; int nr; int err = -ENOSPC; @@ -1459,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, huge = false; nr = huge ? HPAGE_PMD_NR : 1; - if (shmem_acct_block(info->flags, nr)) + if (!shmem_inode_acct_block(inode, nr)) goto failed; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - nr) > 0) - goto unacct; - percpu_counter_add(&sbinfo->used_blocks, nr); - } if (huge) page = shmem_alloc_hugepage(gfp, info, index); @@ -1479,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, } err = -ENOMEM; - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -nr); -unacct: - shmem_unacct_blocks(info->flags, nr); + shmem_inode_unacct_blocks(inode, nr); failed: return ERR_PTR(err); } @@ -1640,7 +1650,7 @@ repeat: if (swap.val) { /* Look it up and read it in.. 
*/ - page = lookup_swap_cache(swap); + page = lookup_swap_cache(swap, NULL, 0); if (!page) { /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -1747,10 +1757,9 @@ repeat: } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, info, sbinfo, - index, true); + page = shmem_alloc_and_acct_page(gfp, inode, index, true); if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, +alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, index, false); } if (IS_ERR(page)) { @@ -1817,7 +1826,11 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); - if (list_empty(&info->shrinklist)) { + /* + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() + */ + if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; @@ -1868,10 +1881,7 @@ clear: * Error recovery. 
*/ unacct: - if (sbinfo->max_blocks) - percpu_counter_sub(&sbinfo->used_blocks, - 1 << compound_order(page)); - shmem_unacct_blocks(info->flags, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); if (PageTransHuge(page)) { unlock_page(page); @@ -2198,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) +static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); @@ -2219,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, int ret; ret = -ENOMEM; - if (shmem_acct_block(info->flags, 1)) + if (!shmem_inode_acct_block(inode, 1)) goto out; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) - goto out_unacct_blocks; - percpu_counter_inc(&sbinfo->used_blocks); - } if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) - goto out_dec_used_blocks; - - page_kaddr = kmap_atomic(page); - ret = copy_from_user(page_kaddr, (const void __user *)src_addr, - PAGE_SIZE); - kunmap_atomic(page_kaddr); - - /* fallback to copy_from_user outside mmap_sem */ - if (unlikely(ret)) { - *pagep = page; - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); - shmem_unacct_blocks(info->flags, 1); - /* don't free the page */ - return -EFAULT; + goto out_unacct_blocks; + + if (!zeropage) { 
/* mcopy_atomic */ + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + shmem_inode_unacct_blocks(inode, 1); + /* don't free the page */ + return -EFAULT; + } + } else { /* mfill_zeropage_atomic */ + clear_highpage(page); } } else { page = *pagep; @@ -2306,14 +2313,33 @@ out_release_uncharge: out_release: unlock_page(page); put_page(page); -out_dec_used_blocks: - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); out_unacct_blocks: - shmem_unacct_blocks(info->flags, 1); + shmem_inode_unacct_blocks(inode, 1); goto out; } +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, false, pagep); +} + +int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct page *page = NULL; + + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, 0, true, &page); +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -3627,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, @@ -3639,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create, char *name; long len; - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; + if (!(flags & MFD_HUGETLB)) 
{ + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ + if (flags & MFD_ALLOW_SEALING) + return -EINVAL; + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); @@ -3671,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create, goto err_name; } - file = shmem_file_setup(name, 0, VM_NORESERVE); + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } - info = SHMEM_I(file_inode(file)); file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_RDWR | O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) + + if (flags & MFD_ALLOW_SEALING) { + /* + * flags check at beginning of function ensures + * this is not a hugetlbfs (MFD_HUGETLB) file. 
+ */ + info = SHMEM_I(file_inode(file)); info->seals &= ~F_SEAL_SEAL; + } fd_install(fd, file); kfree(name); @@ -3959,7 +4009,7 @@ int __init shmem_init(void) } #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE - if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY) + if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = 0; /* just in case it was patched */ @@ -4020,7 +4070,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, return -EINVAL; shmem_huge = huge; - if (shmem_huge < SHMEM_HUGE_DENY) + if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; return count; } diff --git a/mm/slab.h b/mm/slab.h index 6885e1192ec5..073362816acc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,6 +43,7 @@ struct kmem_cache { #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> +#include <linux/sched/mm.h> /* * State of the slab allocator. @@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s, flags)) diff --git a/mm/slob.c b/mm/slob.c index 1bae78d71096..a8bd6fa11a66 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - lockdep_trace_alloc(gfp); + fs_reclaim_acquire(gfp); + fs_reclaim_release(gfp); if (size < PAGE_SIZE - align) { if (!size) @@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); diff --git a/mm/slub.c b/mm/slub.c index 1d3f9835f4ea..ddb04576b342 100644 --- 
a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,7 @@ #include <linux/stacktrace.h> #include <linux/prefetch.h> #include <linux/memcontrol.h> +#include <linux/random.h> #include <trace/events/kmem.h> @@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ +/* + * Returns freelist pointer (ptr). With hardening, this is obfuscated + * with an XOR of the address where the pointer is held and a per-cache + * random number. + */ +static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, + unsigned long ptr_addr) +{ +#ifdef CONFIG_SLAB_FREELIST_HARDENED + return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); +#else + return ptr; +#endif +} + +/* Returns the freelist pointer recorded at location ptr_addr. */ +static inline void *freelist_dereference(const struct kmem_cache *s, + void *ptr_addr) +{ + return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), + (unsigned long)ptr_addr); +} + static inline void *get_freepointer(struct kmem_cache *s, void *object) { - return *(void **)(object + s->offset); + return freelist_dereference(s, object + s->offset); } static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + if (object) + prefetch(freelist_dereference(s, object + s->offset)); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { + unsigned long freepointer_addr; void *p; if (!debug_pagealloc_enabled()) return get_freepointer(s, object); - probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); - return p; + freepointer_addr = (unsigned long)object + s->offset; + probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + return freelist_ptr(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { - *(void **)(object + s->offset) = fp; + unsigned long freeptr_addr = (unsigned 
long)object + s->offset; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + BUG_ON(object == fp); /* naive detection of double free or corruption */ +#endif + + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } /* Loop over all objects in a slab */ @@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { - kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; + kmem_cache_free(kmem_cache_node, n); } } @@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 0; } - s->node[node] = n; init_kmem_cache_node(n); + s->node[node] = n; } return 1; } @@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); @@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = { NULL }; -static struct attribute_group slab_attr_group = { +static const struct attribute_group slab_attr_group = { .attrs = slab_attrs, }; @@ -5642,13 +5678,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) * A cache is never shut down before deactivation is * complete, so no need to worry about synchronization. 
*/ - return; + goto out; #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); +out: kobject_put(&s->kobj); } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c50b1a14d55e..d1a39b8051e0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (slab_is_available()) { struct page *page; - if (node_state(node, N_HIGH_MEMORY)) - page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); - else - page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, + get_order(size)); if (page) return page_address(page); return NULL; diff --git a/mm/sparse.c b/mm/sparse.c index 7b4be3fd5cac..a9783acf2bb9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) { - if (node_state(nid, N_HIGH_MEMORY)) - section = kzalloc_node(array_size, GFP_KERNEL, nid); - else - section = kzalloc(array_size, GFP_KERNEL); - } else { + if (slab_is_available()) + section = kzalloc_node(array_size, GFP_KERNEL, nid); + else section = memblock_virt_alloc_node(array_size, nid); - } return section; } diff --git a/mm/swap.c b/mm/swap.c index 60b1d2a75852..62d96b8e5eb3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -946,28 +946,34 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) } /** - * pagevec_lookup - gang pagecache lookup + * pagevec_lookup_range - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index * @nr_pages: The maximum number of pages * - * pagevec_lookup() will search for and return 
a group of up to @nr_pages pages - * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting from index @start and upto index @end + * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. + * indexes. There may be holes in the indices due to not-present pages. We + * also update @start to index the next page for the traversal. * - * pagevec_lookup() returns the number of pages which were found. + * pagevec_lookup_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages) +unsigned pagevec_lookup_range(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *start, pgoff_t end) { - pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, + pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup); +EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) diff --git a/mm/swap_state.c b/mm/swap_state.c index b68c93014f50..71ce2d1ccbf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; +bool swap_vma_readahead = true; + +#define SWAP_RA_MAX_ORDER_DEFAULT 3 + +static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; + +#define SWAP_RA_WIN_SHIFT 
(PAGE_SHIFT / 2) +#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) +#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK +#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) + +#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) +#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) +#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) + +#define SWAP_RA_VAL(addr, win, hits) \ + (((addr) & PAGE_MASK) | \ + (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ + ((hits) & SWAP_RA_HITS_MASK)) + +/* Initial readahead hits is 4 to start up with a small window */ +#define GET_SWAP_RA_VAL(vma) \ + (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) #define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) @@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr) * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + unsigned long ra_info; + int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page && likely(!PageTransCompound(page))) { + INC_CACHE_INFO(find_total); + if (page) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) - atomic_inc(&swapin_readahead_hits); + if (unlikely(PageTransCompound(page))) + return page; + readahead = TestClearPageReadahead(page); + if (vma) { + ra_info = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_info); + hits = SWAP_RA_HITS(ra_info); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); + } + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma) + atomic_inc(&swapin_readahead_hits); + } } - - INC_CACHE_INFO(find_total); return page; } @@ -424,22 
+464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return retpage; } -static unsigned long swapin_nr_pages(unsigned long offset) +static unsigned int __swapin_nr_pages(unsigned long prev_offset, + unsigned long offset, + int hits, + int max_pages, + int prev_win) { - static unsigned long prev_offset; - unsigned int pages, max_pages, last_ra; - static atomic_t last_readahead_pages; - - max_pages = 1 << READ_ONCE(page_cluster); - if (max_pages <= 1) - return 1; + unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ - pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get @@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset) */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; - prev_offset = offset; } else { unsigned int roundup = 4; while (roundup < pages) @@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset) pages = max_pages; /* Don't shrink readahead too fast */ - last_ra = atomic_read(&last_readahead_pages) / 2; + last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; + + return pages; +} + +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int hits, pages, max_pages; + static atomic_t last_readahead_pages; + + max_pages = 1 << READ_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + hits = atomic_xchg(&swapin_readahead_hits, 0); + pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + atomic_read(&last_readahead_pages)); + if (!hits) + prev_offset = offset; atomic_set(&last_readahead_pages, pages); return pages; @@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 
unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; - bool do_poll = true; + bool do_poll = true, page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, false); + page = __read_swap_cache_async( + swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr, &page_allocated); if (!page) continue; - if (offset != entry_offset && likely(!PageTransCompound(page))) - SetPageReadahead(page); + if (page_allocated) { + swap_readpage(page, false); + if (offset != entry_offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } put_page(page); } blk_finish_plug(&plug); @@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type) synchronize_rcu(); kvfree(spaces); } + +static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, + unsigned long faddr, + unsigned long lpfn, + unsigned long rpfn, + unsigned long *start, + unsigned long *end) +{ + *start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + *end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); +} + +struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long swap_ra_info; + struct page *page; + swp_entry_t entry; + unsigned long faddr, pfn, fpfn; + unsigned long start, end; + pte_t *pte; + unsigned int max_win, hits, prev_win, win, left; +#ifndef CONFIG_64BIT + pte_t *tpte; +#endif + + faddr = vmf->address; + entry = pte_to_swp_entry(vmf->orig_pte); + if ((unlikely(non_swap_entry(entry)))) + return NULL; + page = lookup_swap_cache(entry, vma, faddr); + if (page) + return page; + 
+ max_win = 1 << READ_ONCE(swap_ra_max_order); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + + fpfn = PFN_DOWN(faddr); + swap_ra_info = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); + prev_win = SWAP_RA_WIN(swap_ra_info); + hits = SWAP_RA_HITS(swap_ra_info); + swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(faddr, win, 0)); + + if (win == 1) + return NULL; + + /* Copy the PTEs because the page table may be unmapped */ + if (fpfn == pfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); + else if (pfn == fpfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, + &start, &end); + else { + left = (win - 1) / 2; + swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, + &start, &end); + } + swap_ra->nr_pte = end - start; + swap_ra->offset = fpfn - start; + pte = vmf->pte - swap_ra->offset; +#ifdef CONFIG_64BIT + swap_ra->ptes = pte; +#else + tpte = swap_ra->ptes; + for (pfn = start; pfn != end; pfn++) + *tpte++ = *pte++; +#endif + + return NULL; +} + +struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct blk_plug plug; + struct vm_area_struct *vma = vmf->vma; + struct page *page; + pte_t *pte, pentry; + swp_entry_t entry; + unsigned int i; + bool page_allocated; + + if (swap_ra->win == 1) + goto skip; + + blk_start_plug(&plug); + for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + i++, pte++) { + pentry = *pte; + if (pte_none(pentry)) + continue; + if (pte_present(pentry)) + continue; + entry = pte_to_swp_entry(pentry); + if (unlikely(non_swap_entry(entry))) + continue; + page = __read_swap_cache_async(entry, gfp_mask, vma, + vmf->address, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false); + if (i != swap_ra->offset && + likely(!PageTransCompound(page))) { + 
SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + lru_add_drain(); +skip: + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, + swap_ra->win == 1); +} + +#ifdef CONFIG_SYSFS +static ssize_t vma_ra_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); +} +static ssize_t vma_ra_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + swap_vma_readahead = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + swap_vma_readahead = false; + else + return -EINVAL; + + return count; +} +static struct kobj_attribute vma_ra_enabled_attr = + __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, + vma_ra_enabled_store); + +static ssize_t vma_ra_max_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", swap_ra_max_order); +} +static ssize_t vma_ra_max_order_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err, v; + + err = kstrtoint(buf, 10, &v); + if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) + return -EINVAL; + + swap_ra_max_order = v; + + return count; +} +static struct kobj_attribute vma_ra_max_order_attr = + __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, + vma_ra_max_order_store); + +static struct attribute *swap_attrs[] = { + &vma_ra_enabled_attr.attr, + &vma_ra_max_order_attr.attr, + NULL, +}; + +static struct attribute_group swap_attr_group = { + .attrs = swap_attrs, +}; + +static int __init swap_init_sysfs(void) +{ + int err; + struct kobject *swap_kobj; + + swap_kobj = kobject_create_and_add("swap", mm_kobj); + if (!swap_kobj) { + pr_err("failed to create swap kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(swap_kobj, &swap_attr_group); + if (err) { + 
pr_err("failed to register swap group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(swap_kobj); + return err; +} +subsys_initcall(swap_init_sysfs); +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ba4aab2db0b..d483278ee35b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority; +static int least_priority = -1; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static PLIST_HEAD(swap_avail_head); +struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); +atomic_t nr_rotate_swap = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_is_huge(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_HUGE; +} + +static inline void cluster_clear_huge(struct swap_cluster_info *info) +{ + info->flags &= ~CLUSTER_FLAG_HUGE; +} + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -580,6 +592,21 @@ new_cluster: return found_free; } +static void __del_from_avail_list(struct swap_info_struct *p) +{ + int nid; + + for_each_node(nid) + plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); 
+} + +static void del_from_avail_list(struct swap_info_struct *p) +{ + spin_lock(&swap_avail_lock); + __del_from_avail_list(p); + spin_unlock(&swap_avail_lock); +} + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(si); + } +} + +static void add_to_avail_list(struct swap_info_struct *p) +{ + int nid; + + spin_lock(&swap_avail_lock); + for_each_node(nid) { + WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); } + spin_unlock(&swap_avail_lock); } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, bool was_full = !si->highest_bit; si->highest_bit = end; - if (was_full && (si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&si->avail_list)); - if (plist_node_empty(&si->avail_list)) - plist_add(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + if (was_full && (si->flags & SWP_WRITEOK)) + add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); si->inuse_pages -= nr_entries; @@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) offset = idx * SWAPFILE_CLUSTER; ci = lock_cluster(si, offset); alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) @@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) struct 
swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; + int node; /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && cluster); @@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + node = numa_node_id(); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ - plist_requeue(&si->avail_list, &swap_avail_head); + plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_list)) { + if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } @@ -934,13 +968,14 @@ start_over: WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); - plist_del(&si->avail_list, &swap_avail_head); + __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } - if (cluster) - n_ret = swap_alloc_cluster(si, swp_entries); - else + if (cluster) { + if (!(si->flags & SWP_FILE)) + n_ret = swap_alloc_cluster(si, swp_entries); + } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); @@ -962,7 +997,7 @@ nextsi: * swap_avail_head list then try it, otherwise start over * if we have not gotten any slots. 
*/ - if (plist_node_empty(&next->avail_list)) + if (plist_node_empty(&next->avail_lists[node])) goto start_over; } @@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry) struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned char *map; - unsigned int i; + unsigned int i, free_entries = 0; + unsigned char val; - si = swap_info_get(entry); + si = _swap_info_get(entry); if (!si) return; ci = lock_cluster(si, offset); + VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { - VM_BUG_ON(map[i] != SWAP_HAS_CACHE); - map[i] = 0; + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + if (!free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] &= ~SWAP_HAS_CACHE; } + cluster_clear_huge(ci); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); + if (free_entries == SWAPFILE_CLUSTER) { + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + } else if (free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { + if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) + free_swap_slot(entry); + } + } +} + +int split_swap_cluster(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = _swap_info_get(entry); + if (!si) + return -EBUSY; + ci = lock_cluster(si, offset); + cluster_clear_huge(ci); + unlock_cluster(ci); + return 0; } #else static inline void swapcache_free_cluster(swp_entry_t entry) @@ -1332,29 +1402,161 @@ out: return count; } +#ifdef CONFIG_THP_SWAP +static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned char 
*map = si->swap_map; + unsigned long roffset = swp_offset(entry); + unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + int i; + bool ret = false; + + ci = lock_cluster_or_swap_info(si, offset); + if (!ci || !cluster_is_huge(ci)) { + if (map[roffset] != SWAP_HAS_CACHE) + ret = true; + goto unlock_out; + } + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (map[offset + i] != SWAP_HAS_CACHE) { + ret = true; + break; + } + } +unlock_out: + unlock_cluster_or_swap_info(si, ci); + return ret; +} + +static bool page_swapped(struct page *page) +{ + swp_entry_t entry; + struct swap_info_struct *si; + + if (likely(!PageTransCompound(page))) + return page_swapcount(page) != 0; + + page = compound_head(page); + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) + return swap_page_trans_huge_swapped(si, entry); + return false; +} + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int i, map_swapcount, _total_mapcount, _total_swapcount; + unsigned long offset = 0; + struct swap_info_struct *si; + struct swap_cluster_info *ci = NULL; + unsigned char *map = NULL; + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; + } + + page = compound_head(page); + + _total_mapcount = _total_swapcount = map_swapcount = 0; + if (PageSwapCache(page)) { + swp_entry_t entry; + + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) { + map = si->swap_map; + offset = swp_offset(entry); + } + } + if (map) + ci = lock_cluster(si, offset); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + _total_mapcount += 
mapcount; + if (map) { + swapcount = swap_count(map[offset + i]); + _total_swapcount += swapcount; + } + map_swapcount = max(map_swapcount, mapcount + swapcount); + } + unlock_cluster(ci); + if (PageDoubleMap(page)) { + map_swapcount -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + map_swapcount += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + if (total_swapcount) + *total_swapcount = _total_swapcount; + + return map_swapcount; +} +#else +#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) +#define page_swapped(page) (page_swapcount(page) != 0) + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + mapcount = page_trans_huge_mapcount(page, total_mapcount); + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; +} +#endif + /* * We can write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. * - * NOTE: total_mapcount should not be relied upon by the caller if + * NOTE: total_map_swapcount should not be relied upon by the caller if * reuse_swap_page() returns false, but it may be always overwritten * (see the other implementation for CONFIG_SWAP=n). 
*/ -bool reuse_swap_page(struct page *page, int *total_mapcount) +bool reuse_swap_page(struct page *page, int *total_map_swapcount) { - int count; + int count, total_mapcount, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_mapcount(page, total_mapcount); - if (count <= 1 && PageSwapCache(page)) { - count += page_swapcount(page); - if (count != 1) - goto out; + count = page_trans_huge_map_swapcount(page, &total_mapcount, + &total_swapcount); + if (total_map_swapcount) + *total_map_swapcount = total_mapcount + total_swapcount; + if (count == 1 && PageSwapCache(page) && + (likely(!PageTransCompound(page)) || + /* The remaining swap count will be freed soon */ + total_swapcount == page_swapcount(page))) { if (!PageWriteback(page)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } else { @@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) spin_unlock(&p->lock); } } -out: + return count <= 1; } @@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_swapcount(page)) + if (page_swapped(page)) return 0; /* @@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { count = __swap_entry_free(p, entry, 1); - if (count == SWAP_HAS_CACHE) { + if (count == SWAP_HAS_CACHE && + !swap_page_trans_huge_swapped(p, entry)) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry) */ if (PageSwapCache(page) && !PageWriteback(page) && (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_swapcount(p, entry)) { + 
!swap_page_trans_huge_swapped(p, entry)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } @@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap, .sync_mode = WB_SYNC_NONE, }; - swap_writepage(page, &wbc); + swap_writepage(compound_head(page), &wbc); lock_page(page); wait_on_page_writeback(page); } @@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap, * delete, since it may not have been written out to swap yet. */ if (PageSwapCache(page) && - likely(page_private(page) == entry.val)) - delete_from_swap_cache(page); + likely(page_private(page) == entry.val) && + !page_swapped(page)) + delete_from_swap_cache(compound_head(page)); /* * So we could skip searching mms once swap count went @@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } +static int swap_node(struct swap_info_struct *p) +{ + struct block_device *bdev; + + if (p->bdev) + bdev = p->bdev; + else + bdev = p->swap_file->f_inode->i_sb->s_bdev; + + return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; +} + static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { + int i; + if (prio >= 0) p->prio = prio; else @@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; - p->avail_list.prio = -p->prio; + for_each_node(i) { + if (p->prio >= 0) + p->avail_lists[i].prio = -p->prio; + else { + if (swap_node(p) == i) + p->avail_lists[i].prio = 1; + else + p->avail_lists[i].prio = -p->prio; + } + } p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * swap_info_struct. 
*/ plist_add(&p->list, &swap_active_head); - spin_lock(&swap_avail_lock); - plist_add(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - spin_lock(&swap_avail_lock); - plist_del(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(p); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; + int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - si->avail_list.prio--; + for_each_node(nid) { + if (si->avail_lists[nid].prio != 1) + si->avail_lists[nid].prio--; + } } least_priority++; } @@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); + if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + atomic_dec(&nr_rotate_swap); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); @@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; unsigned int type; + int i; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) } INIT_LIST_HEAD(&p->first_swap_extent.list); plist_node_init(&p->list, 0); - plist_node_init(&p->avail_list, 0); + for_each_node(i) + plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); @@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!swap_avail_heads) + return -ENOMEM; + p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); @@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) cluster = 
per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } - } + } else + atomic_inc(&nr_rotate_swap); error = swap_cgroup_swapon(p->type, maxpages); if (error) @@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } } + +static int __init swapfile_init(void) +{ + int nid; + + swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), + GFP_KERNEL); + if (!swap_avail_heads) { + pr_emerg("Not enough memory for swap heads, swap is disabled\n"); + return -ENOMEM; + } + + for_each_node(nid) + plist_head_init(&swap_avail_heads[nid]); + + return 0; +} +subsys_initcall(swapfile_init); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 8bcb501bce60..81192701964d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, bool zeropage); #endif /* CONFIG_HUGETLB_PAGE */ +static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **page, + bool zeropage) +{ + ssize_t err; + + if (vma_is_anonymous(dst_vma)) { + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } else { + if (!zeropage) + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, + dst_vma, dst_addr, + src_addr, page); + else + err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } + + return err; +} + static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, @@ -487,22 +517,8 @@ retry: BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - if (vma_is_anonymous(dst_vma)) { - if (!zeropage) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, - &page); - else - err = mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, 
dst_addr); - } else { - err = -EINVAL; /* if zeropage is true return -EINVAL */ - if (likely(!zeropage)) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, &page); - } - + err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, &page, zeropage); cond_resched(); if (unlikely(err == -EFAULT)) { diff --git a/mm/util.c b/mm/util.c index 7b07ec852e01..34e57fae959d 100644 --- a/mm/util.c +++ b/mm/util.c @@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); + free = global_zone_page_state(NR_FREE_PAGES); free += global_node_page_state(NR_FILE_PAGES); /* @@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * which are reclaimable, under pressure. The dentry * cache and most inode caches should fall into this */ - free += global_page_state(NR_SLAB_RECLAIMABLE); + free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8698c1c86c4d..8a43db6284eb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -49,12 +49,10 @@ static void __vunmap(const void *, int); static void free_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *llnode = llist_del_all(&p->list); - while (llnode) { - void *p = llnode; - llnode = llist_next(llnode); - __vunmap(p, 1); - } + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); } /*** Page table manipulation functions ***/ @@ -1671,7 +1669,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? + 0 : + __GFP_HIGHMEM; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1679,7 +1680,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, PAGE_KERNEL, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); @@ -1700,9 +1701,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_page(alloc_mask|highmem_mask); else - page = alloc_pages_node(node, alloc_mask, 0); + page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1710,7 +1711,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask)) + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } @@ -2479,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * matching slot. While scanning, if any of the areas overlaps with * existing vmap_area, the base address is pulled down to fit the * area. Scanning is repeated till all the areas fit and then all - * necessary data structres are inserted and the result is returned. + * necessary data structures are inserted and the result is returned. 
*/ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2507,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, if (start > offsets[last_area]) last_area = area; - for (area2 = 0; area2 < nr_vms; area2++) { + for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; - if (area2 == area) - continue; - - BUG_ON(start2 >= start && start2 < end); - BUG_ON(end2 <= end && end2 > start); + BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; diff --git a/mm/vmscan.c b/mm/vmscan.c index a1af041930a6..13d711dd8776 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; - count_vm_events(SLABS_SCANNED, nr_to_scan); - total_scan -= nr_to_scan; - scanned += nr_to_scan; + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; cond_resched(); } @@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page) * that isolated the page, the page cache radix tree and * optional buffer heads at page->private. */ - return page_count(page) - page_has_private(page) == 2; + int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? 
+ HPAGE_PMD_NR : 1; + return page_count(page) - page_has_private(page) == 1 + radix_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; + int refcount; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (!page_ref_freeze(page, 2)) + if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) + refcount = 1 + HPAGE_PMD_NR; + else + refcount = 2; + if (!page_ref_freeze(page, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_ref_unfreeze(page, 2); + page_ref_unfreeze(page, refcount); goto cannot_free; } @@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. * Lazyfree page could be freed directly */ - if (PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (PageTransHuge(page)) { - /* cannot split THP, skip it */ - if (!can_split_huge_page(page, NULL)) - goto activate_locked; - /* - * Split pages without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. 
- */ - if (!compound_mapcount(page) && - split_huge_page_to_list(page, page_list)) - goto activate_locked; - } - if (!add_to_swap(page)) { - if (!PageTransHuge(page)) - goto activate_locked; - /* Split THP and swap individual base pages */ - if (split_huge_page_to_list(page, page_list)) - goto activate_locked; - if (!add_to_swap(page)) - goto activate_locked; - } - - /* XXX: We don't support THP writes */ - if (PageTransHuge(page) && - split_huge_page_to_list(page, page_list)) { - delete_from_swap_cache(page); - goto activate_locked; - } + if (PageAnon(page) && PageSwapBacked(page)) { + if (!PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, + page_list)) + goto activate_locked; + } + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Fallback to swap normal pages */ + if (split_huge_page_to_list(page, + page_list)) + goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_vm_event(THP_SWPOUT_FALLBACK); +#endif + if (!add_to_swap(page)) + goto activate_locked; + } - may_enter_fs = 1; + may_enter_fs = 1; - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } } else if (unlikely(PageTransHuge(page))) { /* Split file THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } - VM_BUG_ON_PAGE(PageTransHuge(page), page); - /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. 
*/ if (page_mapped(page)) { - if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { + enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + + if (unlikely(PageTransHuge(page))) + flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { nr_unmap_fail++; goto activate_locked; } @@ -1312,7 +1321,11 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - list_add(&page->lru, &free_pages); + if (unlikely(PageTransHuge(page))) { + mem_cgroup_uncharge(page); + (*get_compound_page_dtor(page))(page); + } else + list_add(&page->lru, &free_pages); continue; activate_locked: @@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (stalled) + return 0; + + /* wait a bit for the reclaimer. */ + msleep(100); + stalled = true; /* We are about to die and free our memory. Return now. 
*/ if (fatal_signal_pending(current)) @@ -3525,8 +3544,6 @@ static int kswapd(void *p) }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - lockdep_set_current_reclaim_state(GFP_KERNEL); - if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -3585,14 +3602,15 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); + fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); return 0; } @@ -3655,14 +3673,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned int noreclaim_flag; noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; @@ -3847,7 +3865,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3862,9 +3880,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; + fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); - lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 
9a4441bbeef2..c7e4b8458023 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return 0; + if (!info->free_blocks_total) return 0; @@ -1071,6 +1074,8 @@ const char * const vmstat_text[] = { #endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", + "thp_swpout", + "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", @@ -1093,6 +1098,10 @@ const char * const vmstat_text[] = { "vmacache_find_hits", "vmacache_full_flushes", #endif +#ifdef CONFIG_SWAP + "swap_ra", + "swap_ra_hit", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ @@ -1250,7 +1259,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, seq_putc(m, '\n'); } -/* Print out the free pages at each order for each migratetype */ +/* Print out the number of pageblocks for each migratetype */ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; @@ -1500,7 +1509,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) @@ -1589,7 +1598,7 @@ int vmstat_refresh(struct ctl_table *table, int write, * which can equally be echo'ed to or cat'ted from (by root), * can be used to update the stats just before reading them. * - * Oh, and since global_page_state() etc. are so careful to hide + * Oh, and since global_zone_page_state() etc. are so careful to hide * transiently negative values, report an error here if any of * the stats is negative, so we know to go looking for imbalance. 
*/ diff --git a/mm/z3fold.c b/mm/z3fold.c index 54f63c4a809a..486550df32be 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -23,10 +23,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/atomic.h> +#include <linux/sched.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/percpu.h> #include <linux/preempt.h> +#include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> @@ -48,11 +51,15 @@ enum buddy { }; /* - * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * struct z3fold_header - z3fold page metadata occupying first chunks of each * z3fold page, except for HEADLESS pages - * @buddy: links the z3fold page into the relevant list in the pool + * @buddy: links the z3fold page into the relevant list in the + * pool * @page_lock: per-page lock - * @refcount: reference cound for the z3fold page + * @refcount: reference count for the z3fold page + * @work: work_struct for page layout optimization + * @pool: pointer to the pool which this page belongs to + * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free @@ -62,6 +69,9 @@ struct z3fold_header { struct list_head buddy; spinlock_t page_lock; struct kref refcount; + struct work_struct work; + struct z3fold_pool *pool; + short cpu; unsigned short first_chunks; unsigned short middle_chunks; unsigned short last_chunks; @@ -92,28 +102,39 @@ struct z3fold_header { /** * struct z3fold_pool - stores metadata for each z3fold pool - * @lock: protects all pool fields and first|last_chunk fields of any - * z3fold page in the pool - * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; - * the lists each z3fold page is added to depends on the size of - * its free region. 
+ * @name: pool name + * @lock: protects pool unbuddied/lru lists + * @stale_lock: protects pool stale page list + * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- + * buddies; the list each z3fold page is added to depends on + * the size of its free region. * @lru: list tracking the z3fold pages in LRU order by most recently * added buddy. + * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @ops: pointer to a structure of user defined operations specified at * pool creation time. + * @compact_wq: workqueue for page layout background optimization + * @release_wq: workqueue for safe page release + * @work: work_struct for safe page release * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. */ struct z3fold_pool { + const char *name; spinlock_t lock; - struct list_head unbuddied[NCHUNKS]; + spinlock_t stale_lock; + struct list_head *unbuddied; struct list_head lru; + struct list_head stale; atomic64_t pages_nr; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; + struct workqueue_struct *compact_wq; + struct workqueue_struct *release_wq; + struct work_struct work; }; /* @@ -122,9 +143,10 @@ struct z3fold_pool { enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, + NEEDS_COMPACTING, + PAGE_STALE }; - /***************** * Helpers *****************/ @@ -138,14 +160,19 @@ static int size_to_chunks(size_t size) #define for_each_unbuddied_list(_iter, _begin) \ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) +static void compact_page_work(struct work_struct *w); + /* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page) +static struct z3fold_header *init_z3fold_page(struct page *page, + struct z3fold_pool *pool) { struct z3fold_header *zhdr = page_address(page); INIT_LIST_HEAD(&page->lru); 
clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); + clear_bit(PAGE_STALE, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page) zhdr->last_chunks = 0; zhdr->first_num = 0; zhdr->start_middle = 0; + zhdr->cpu = -1; + zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); + INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } @@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page) __free_page(page); } -static void release_z3fold_page(struct kref *ref) -{ - struct z3fold_header *zhdr; - struct page *page; - - zhdr = container_of(ref, struct z3fold_header, refcount); - page = virt_to_page(zhdr); - - if (!list_empty(&zhdr->buddy)) - list_del(&zhdr->buddy); - if (!list_empty(&page->lru)) - list_del(&page->lru); - free_z3fold_page(page); -} - /* Lock a z3fold page */ static inline void z3fold_page_lock(struct z3fold_header *zhdr) { @@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle) return (handle - zhdr->first_num) & BUDDY_MASK; } +static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) +{ + struct page *page = virt_to_page(zhdr); + struct z3fold_pool *pool = zhdr->pool; + + WARN_ON(!list_empty(&zhdr->buddy)); + set_bit(PAGE_STALE, &page->private); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del(&page->lru); + spin_unlock(&pool->lock); + if (locked) + z3fold_page_unlock(zhdr); + spin_lock(&pool->stale_lock); + list_add(&zhdr->buddy, &pool->stale); + queue_work(pool->release_wq, &pool->work); + spin_unlock(&pool->stale_lock); +} + +static void __attribute__((__unused__)) + release_z3fold_page(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + __release_z3fold_page(zhdr, false); +} + +static void release_z3fold_page_locked(struct kref *ref) +{ + 
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void release_z3fold_page_locked_list(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + spin_lock(&zhdr->pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&zhdr->pool->lock); + + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void free_pages_work(struct work_struct *w) +{ + struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); + + spin_lock(&pool->stale_lock); + while (!list_empty(&pool->stale)) { + struct z3fold_header *zhdr = list_first_entry(&pool->stale, + struct z3fold_header, buddy); + struct page *page = virt_to_page(zhdr); + + list_del(&zhdr->buddy); + if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) + continue; + clear_bit(NEEDS_COMPACTING, &page->private); + spin_unlock(&pool->stale_lock); + cancel_work_sync(&zhdr->work); + free_z3fold_page(page); + cond_resched(); + spin_lock(&pool->stale_lock); + } + spin_unlock(&pool->stale_lock); +} + /* * Returns the number of free chunks in a z3fold page. * NB: can't be used with HEADLESS pages. @@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr) return nfree; } -/***************** - * API Functions -*****************/ -/** - * z3fold_create_pool() - create a new z3fold pool - * @gfp: gfp flags when allocating the z3fold pool structure - * @ops: user-defined operations for the z3fold pool - * - * Return: pointer to the new z3fold pool or NULL if the metadata allocation - * failed. 
- */ -static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, - const struct z3fold_ops *ops) -{ - struct z3fold_pool *pool; - int i; - - pool = kzalloc(sizeof(struct z3fold_pool), gfp); - if (!pool) - return NULL; - spin_lock_init(&pool->lock); - for_each_unbuddied_list(i, 0) - INIT_LIST_HEAD(&pool->unbuddied[i]); - INIT_LIST_HEAD(&pool->lru); - atomic64_set(&pool->pages_nr, 0); - pool->ops = ops; - return pool; -} - -/** - * z3fold_destroy_pool() - destroys an existing z3fold pool - * @pool: the z3fold pool to be destroyed - * - * The pool should be emptied before this function is called. - */ -static void z3fold_destroy_pool(struct z3fold_pool *pool) -{ - kfree(pool); -} - static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { @@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) return 0; } +static void do_compact_page(struct z3fold_header *zhdr, bool locked) +{ + struct z3fold_pool *pool = zhdr->pool; + struct page *page; + struct list_head *unbuddied; + int fchunks; + + page = virt_to_page(zhdr); + if (locked) + WARN_ON(z3fold_page_trylock(zhdr)); + else + z3fold_page_lock(zhdr); + if (test_bit(PAGE_STALE, &page->private) || + !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + return; + } + spin_lock(&pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + z3fold_compact_page(zhdr); + unbuddied = get_cpu_ptr(pool->unbuddied); + fchunks = num_free_chunks(zhdr); + if (fchunks < NCHUNKS && + (!zhdr->first_chunks || !zhdr->middle_chunks || + !zhdr->last_chunks)) { + /* the page's not completely free and it's unbuddied */ + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[fchunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + } + put_cpu_ptr(pool->unbuddied); + z3fold_page_unlock(zhdr); +} + +static void compact_page_work(struct work_struct *w) +{ + struct z3fold_header *zhdr = container_of(w, struct z3fold_header, 
+ work); + + do_compact_page(zhdr, false); +} + + +/* + * API Functions + */ + +/** + * z3fold_create_pool() - create a new z3fold pool + * @name: pool name + * @gfp: gfp flags when allocating the z3fold pool structure + * @ops: user-defined operations for the z3fold pool + * + * Return: pointer to the new z3fold pool or NULL if the metadata allocation + * failed. + */ +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, + const struct z3fold_ops *ops) +{ + struct z3fold_pool *pool = NULL; + int i, cpu; + + pool = kzalloc(sizeof(struct z3fold_pool), gfp); + if (!pool) + goto out; + spin_lock_init(&pool->lock); + spin_lock_init(&pool->stale_lock); + pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + for_each_possible_cpu(cpu) { + struct list_head *unbuddied = + per_cpu_ptr(pool->unbuddied, cpu); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&unbuddied[i]); + } + INIT_LIST_HEAD(&pool->lru); + INIT_LIST_HEAD(&pool->stale); + atomic64_set(&pool->pages_nr, 0); + pool->name = name; + pool->compact_wq = create_singlethread_workqueue(pool->name); + if (!pool->compact_wq) + goto out; + pool->release_wq = create_singlethread_workqueue(pool->name); + if (!pool->release_wq) + goto out_wq; + INIT_WORK(&pool->work, free_pages_work); + pool->ops = ops; + return pool; + +out_wq: + destroy_workqueue(pool->compact_wq); +out: + kfree(pool); + return NULL; +} + +/** + * z3fold_destroy_pool() - destroys an existing z3fold pool + * @pool: the z3fold pool to be destroyed + * + * The pool should be emptied before this function is called. 
+ */ +static void z3fold_destroy_pool(struct z3fold_pool *pool) +{ + destroy_workqueue(pool->release_wq); + destroy_workqueue(pool->compact_wq); + kfree(pool); +} + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, { int chunks = 0, i, freechunks; struct z3fold_header *zhdr = NULL; + struct page *page = NULL; enum buddy bud; - struct page *page; + bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; @@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { + struct list_head *unbuddied; chunks = size_to_chunks(size); +lookup: /* First, try to find an unbuddied z3fold page. */ - zhdr = NULL; + unbuddied = get_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { - spin_lock(&pool->lock); - zhdr = list_first_entry_or_null(&pool->unbuddied[i], + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy); - if (!zhdr || !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); + + if (!zhdr) continue; + + /* Re-check under lock. 
*/ + spin_lock(&pool->lock); + l = &unbuddied[i]; + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + put_cpu_ptr(pool->unbuddied); + goto lookup; } - kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); + zhdr->cpu = -1; spin_unlock(&pool->lock); page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + put_cpu_ptr(pool->unbuddied); + + if (zhdr) { if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && chunks >= zhdr->start_middle) @@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, else if (zhdr->middle_chunks == 0) bud = MIDDLE; else { - z3fold_page_unlock(zhdr); - spin_lock(&pool->lock); if (kref_put(&zhdr->refcount, - release_z3fold_page)) + release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); - spin_unlock(&pool->lock); + else + z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); - continue; + goto lookup; } goto found; } bud = FIRST; } - /* Couldn't find unbuddied z3fold page, create new one */ - page = alloc_page(gfp); + spin_lock(&pool->stale_lock); + zhdr = list_first_entry_or_null(&pool->stale, + struct z3fold_header, buddy); + /* + * Before allocating a page, let's see if we can take one from the + * stale pages list. cancel_work_sync() can sleep so we must make + * sure it won't be called in case we're in atomic context. 
+ */ + if (zhdr && (can_sleep || !work_pending(&zhdr->work) || + !unlikely(work_busy(&zhdr->work)))) { + list_del(&zhdr->buddy); + clear_bit(NEEDS_COMPACTING, &page->private); + spin_unlock(&pool->stale_lock); + if (can_sleep) + cancel_work_sync(&zhdr->work); + page = virt_to_page(zhdr); + } else { + spin_unlock(&pool->stale_lock); + page = alloc_page(gfp); + } + if (!page) return -ENOMEM; atomic64_inc(&pool->pages_nr); - zhdr = init_z3fold_page(page); + zhdr = init_z3fold_page(page, pool); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); - spin_lock(&pool->lock); goto headless; } z3fold_page_lock(zhdr); @@ -451,15 +659,21 @@ found: zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } - spin_lock(&pool->lock); if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); + /* Add to unbuddied list */ freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + put_cpu_ptr(pool->unbuddied); } headless: + spin_lock(&pool->lock); /* Add/move z3fold page to beginning of LRU */ if (!list_empty(&page->lru)) list_del(&page->lru); @@ -487,7 +701,6 @@ headless: static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; - int freechunks; struct page *page; enum buddy bud; @@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_unlock(&pool->lock); free_z3fold_page(page); atomic64_dec(&pool->pages_nr); - } else { - if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || - zhdr->last_chunks != 0) { - z3fold_compact_page(zhdr); - /* Add to the unbuddied list */ - spin_lock(&pool->lock); - if (!list_empty(&zhdr->buddy)) - list_del(&zhdr->buddy); - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, 
&pool->unbuddied[freechunks]); - spin_unlock(&pool->lock); - } + return; + } + + if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { + atomic64_dec(&pool->pages_nr); + return; + } + if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); + return; + } + if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { spin_lock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page)) - atomic64_dec(&pool->pages_nr); + list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); + zhdr->cpu = -1; + do_compact_page(zhdr, true); + return; } - + queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + z3fold_page_unlock(zhdr); } /** @@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) */ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) { - int i, ret = 0, freechunks; - struct z3fold_header *zhdr; - struct page *page; + int i, ret = 0; + struct z3fold_header *zhdr = NULL; + struct page *page = NULL; + struct list_head *pos; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) spin_unlock(&pool->lock); return -EINVAL; } - page = list_last_entry(&pool->lru, struct page, lru); + list_for_each_prev(pos, &pool->lru) { + page = list_entry(pos, struct page, lru); + if (test_bit(PAGE_HEADLESS, &page->private)) + /* candidate found */ + break; + + zhdr = page_address(page); + if (!z3fold_page_trylock(zhdr)) + continue; /* can't evict at this point */ + kref_get(&zhdr->refcount); + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + } + list_del_init(&page->lru); + spin_unlock(&pool->lock); - zhdr = page_address(page); if (!test_bit(PAGE_HEADLESS, &page->private)) { - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - kref_get(&zhdr->refcount); - spin_unlock(&pool->lock); - z3fold_page_lock(zhdr); /* * We need encode the handles 
before unlocking, since * we can race with free that will set @@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) last_handle = encode_handle(zhdr, LAST); + /* + * it's safe to unlock here because we hold a + * reference to this page + */ z3fold_page_unlock(zhdr); } else { first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; - spin_unlock(&pool->lock); } /* Issue the eviction callback(s) */ @@ -652,31 +879,12 @@ next: if (ret == 0) { free_z3fold_page(page); return 0; - } else { - spin_lock(&pool->lock); - } - } else { - z3fold_page_lock(zhdr); - if ((zhdr->first_chunks || zhdr->last_chunks || - zhdr->middle_chunks) && - !(zhdr->first_chunks && zhdr->last_chunks && - zhdr->middle_chunks)) { - z3fold_compact_page(zhdr); - /* add to unbuddied list */ - spin_lock(&pool->lock); - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, - &pool->unbuddied[freechunks]); - spin_unlock(&pool->lock); - } - z3fold_page_unlock(zhdr); - spin_lock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page)) { - spin_unlock(&pool->lock); - atomic64_dec(&pool->pages_nr); - return 0; } + } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { + atomic64_dec(&pool->pages_nr); + return 0; } + spin_lock(&pool->lock); /* * Add to the beginning of LRU. @@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp, { struct z3fold_pool *pool; - pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); + pool = z3fold_create_pool(name, gfp, + zpool_ops ? 
&z3fold_zpool_ops : NULL); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 013eea76685e..62457eb82330 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, spin_lock(&class->lock); if (!get_zspage_inuse(zspage)) { - ret = -EBUSY; - goto unlock_class; + /* + * Set "offset" to end of the page so that every loops + * skips unnecessary object scanning. + */ + offset = PAGE_SIZE; } pos = offset; @@ -2052,7 +2055,6 @@ unpin_objects: } } kunmap_atomic(s_addr); -unlock_class: spin_unlock(&class->lock); migrate_write_unlock(zspage); @@ -2453,7 +2455,6 @@ void zs_destroy_pool(struct zs_pool *pool) } destroy_cache(pool); - kfree(pool->size_class); kfree(pool->name); kfree(pool); } |