author     Linus Torvalds <torvalds@linux-foundation.org>    2018-04-11 20:51:26 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>    2018-04-11 20:51:26 +0300
commit     8837c70d531a1788f975c366c254a5cb973a5291
tree       f7a719d01090efb3bc534f5b0d7f13ec87eecadb /mm
parent     b284d4d5a6785f8cd07eda2646a95782373cd01e
parent     b93b016313b3ba8003c3b8bb71f569af91f19fc7
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- almost all of the rest of MM
- kasan updates
- lots of procfs work
- misc things
- lib/ updates
- checkpatch
- rapidio
- ipc/shm updates
- the start of willy's XArray conversion
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (140 commits)
page cache: use xa_lock
xarray: add the xa_lock to the radix_tree_root
fscache: use appropriate radix tree accessors
export __set_page_dirty
unicore32: turn flush_dcache_mmap_lock into a no-op
arm64: turn flush_dcache_mmap_lock into a no-op
mac80211_hwsim: use DEFINE_IDA
radix tree: use GFP_ZONEMASK bits of gfp_t for flags
linux/const.h: refactor _BITUL and _BITULL a bit
linux/const.h: move UL() macro to include/linux/const.h
linux/const.h: prefix include guard of uapi/linux/const.h with _UAPI
xen, mm: allow deferred page initialization for xen pv domains
elf: enforce MAP_FIXED on overlaying elf segments
fs, elf: drop MAP_FIXED usage from elf_map
mm: introduce MAP_FIXED_NOREPLACE
MAINTAINERS: update bouncing aacraid@adaptec.com addresses
fs/dcache.c: add cond_resched() in shrink_dentry_list()
include/linux/kfifo.h: fix comment
ipc/shm.c: shm_split(): remove unneeded test for NULL shm_file_data.vm_ops
kernel/sysctl.c: add kdoc comments to do_proc_do{u}intvec_minmax_conv_param
...
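Several of the patches above ("page cache: use xa_lock", "xarray: add the xa_lock to the radix_tree_root") begin the XArray conversion by renaming the page-cache tree and its lock: mapping->page_tree becomes mapping->i_pages, and mapping->tree_lock is replaced by the xa_lock_*() helpers on i_pages. A minimal illustrative hunk of the pattern that recurs throughout the mm/ diff further below (composed here for illustration, not copied verbatim from any single file):

-	spin_lock_irq(&mapping->tree_lock);
-	radix_tree_delete(&mapping->page_tree, index);
-	spin_unlock_irq(&mapping->tree_lock);
+	xa_lock_irq(&mapping->i_pages);
+	radix_tree_delete(&mapping->i_pages, index);
+	xa_unlock_irq(&mapping->i_pages);

At this stage the radix-tree accessors themselves are unchanged; only the tree root and its lock move into the address_space's i_pages field.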
Diffstat (limited to 'mm')
 mm/backing-dev.c    |  19
 mm/cma.c            |  83
 mm/compaction.c     |   7
 mm/filemap.c        |  84
 mm/hmm.c            | 573
 mm/huge_memory.c    |  21
 mm/internal.h       |   5
 mm/khugepaged.c     |  67
 mm/ksm.c            |   7
 mm/memcontrol.c     |  37
 mm/memory-failure.c |   2
 mm/memory_hotplug.c |   5
 mm/mempolicy.c      |  40
 mm/migrate.c        | 401
 mm/mmap.c           |  11
 mm/mprotect.c       |   9
 mm/page-writeback.c |  43
 mm/page_alloc.c     | 119
 mm/page_isolation.c |   3
 mm/readahead.c      |   2
 mm/rmap.c           |   4
 mm/shmem.c          |  60
 mm/slub.c           |  57
 mm/swap_state.c     |  17
 mm/swapfile.c       |   6
 mm/truncate.c       |  22
 mm/util.c           |   9
 mm/vmscan.c         | 210
 mm/vmstat.c         |   1
 mm/workingset.c     |  22
 mm/z3fold.c         |  11
31 files changed, 1136 insertions(+), 821 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 08b9aab631ab..023190c69dce 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1020,23 +1020,18 @@ EXPORT_SYMBOL(congestion_wait); /** * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes - * @pgdat: A pgdat to check if it is heavily congested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * - * In the event of a congested backing_dev (any backing_dev) and the given - * @pgdat has experienced recent congestion, this waits for up to @timeout - * jiffies for either a BDI to exit congestion of the given @sync queue - * or a write to complete. - * - * In the absence of pgdat congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the event of a congested backing_dev (any backing_dev) this waits + * for up to @timeout jiffies for either a BDI to exit congestion of the + * given @sync queue or a write to complete. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. */ -long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) +long wait_iff_congested(int sync, long timeout) { long ret; unsigned long start = jiffies; @@ -1044,12 +1039,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout) wait_queue_head_t *wqh = &congestion_wqh[sync]; /* - * If there is no congestion, or heavy congestion is not being - * encountered in the current pgdat, yield if necessary instead + * If there is no congestion, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_wb_congested[sync]) == 0 || - !test_bit(PGDAT_CONGESTED, &pgdat->flags)) { + if (atomic_read(&nr_wb_congested[sync]) == 0) { cond_resched(); /* In case we scheduled, work out time remaining */ @@ -39,6 +39,7 @@ #include <trace/events/cma.h> #include "cma.h" +#include "internal.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; @@ -109,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma) if (!cma->bitmap) return -ENOMEM; - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { unsigned j; base_pfn = pfn; + if (!pfn_valid(base_pfn)) + goto err; + + zone = page_zone(pfn_to_page(base_pfn)); for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); + if (!pfn_valid(pfn)) + goto err; + /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. + * In init_cma_reserved_pageblock(), present_pages + * is adjusted with assumption that all pages in + * the pageblock come from a single zone. 
*/ if (page_zone(pfn_to_page(pfn)) != zone) - goto not_in_zone; + goto err; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -139,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; -not_in_zone: +err: pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; @@ -149,6 +152,41 @@ not_in_zone: static int __init cma_init_reserved_areas(void) { int i; + struct zone *zone; + pg_data_t *pgdat; + + if (!cma_area_count) + return 0; + + for_each_online_pgdat(pgdat) { + unsigned long start_pfn = UINT_MAX, end_pfn = 0; + + zone = &pgdat->node_zones[ZONE_MOVABLE]; + + /* + * In this case, we cannot adjust the zone range + * since it is now maximum node span and we don't + * know original zone range. + */ + if (populated_zone(zone)) + continue; + + for (i = 0; i < cma_area_count; i++) { + if (pfn_to_nid(cma_areas[i].base_pfn) != + pgdat->node_id) + continue; + + start_pfn = min(start_pfn, cma_areas[i].base_pfn); + end_pfn = max(end_pfn, cma_areas[i].base_pfn + + cma_areas[i].count); + } + + if (!end_pfn) + continue; + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } for (i = 0; i < cma_area_count; i++) { int ret = cma_activate_area(&cma_areas[i]); @@ -157,9 +195,32 @@ static int __init cma_init_reserved_areas(void) return ret; } + /* + * Reserved pages for ZONE_MOVABLE are now activated and + * this would change ZONE_MOVABLE's managed page counter and + * the other zones' present counter. We need to re-calculate + * various zone information that depends on this initialization. + */ + build_all_zonelists(NULL); + for_each_populated_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + zone_pcp_reset(zone); + setup_zone_pageset(zone); + } else + zone_pcp_update(zone); + + set_zone_contiguous(zone); + } + + /* + * We need to re-init per zone wmark by calling + * init_per_zone_wmark_min() but doesn't call here because it is + * registered on core_initcall and it will be called later than us. + */ + return 0; } -core_initcall(cma_init_reserved_areas); +pure_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory diff --git a/mm/compaction.c b/mm/compaction.c index 88d01a50a015..028b7210a669 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1166,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc) * from the isolated freelists in the block we are migrating to. */ static struct page *compaction_alloc(struct page *migratepage, - unsigned long data, - int **result) + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; @@ -1451,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - ALLOC_CMA, wmark_target)) + 0, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; diff --git a/mm/filemap.c b/mm/filemap.c index 693f62212a59..ab77e19ab09c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->i_pages lock * * ->i_mutex * ->i_mmap_rwsem (truncate->unmap_mapping_range) @@ -74,7 +74,7 @@ * ->mmap_sem * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) - * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -84,7 +84,7 @@ * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) - * ->mapping->tree_lock (__sync_single_inode) + * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) @@ -95,11 +95,11 @@ * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->tree_lock (try_to_unmap_one) + * ->i_pages lock (try_to_unmap_one) * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) - * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) @@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, 0, + error = __radix_tree_create(&mapping->i_pages, page->index, 0, &node, &slot); if (error) return error; if (*slot) { void *p; - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + p = radix_tree_deref_slot_protected(slot, + &mapping->i_pages.xa_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; @@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping, if (shadowp) *shadowp = p; } - __radix_tree_replace(&mapping->page_tree, node, slot, page, + __radix_tree_replace(&mapping->i_pages, node, slot, page, workingset_lookup_update(mapping)); mapping->nrpages++; return 0; @@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping, struct radix_tree_node *node; void **slot; - __radix_tree_lookup(&mapping->page_tree, page->index + i, + __radix_tree_lookup(&mapping->i_pages, page->index + i, &node, &slot); VM_BUG_ON_PAGE(!node && nr != 1, page); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + radix_tree_clear_tags(&mapping->i_pages, node, slot); + __radix_tree_replace(&mapping->i_pages, node, slot, shadow, workingset_lookup_update(mapping)); } @@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the i_pages lock. 
*/ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page) unsigned long flags; BUG_ON(!PageLocked(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); page_cache_free_page(mapping, page); } @@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache); * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * - * The function walks over mapping->page_tree and removes pages passed in @pvec - * from the radix tree. The function expects @pvec to be sorted by page index. - * It tolerates holes in @pvec (radix tree entries at those indices are not + * The function walks over mapping->i_pages and removes pages passed in @pvec + * from the mapping. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the radix - * tree as well. + * @pvec and takes care to delete all corresponding tail pages from the + * mapping as well. * - * The function expects mapping->tree_lock to be held. + * The function expects the i_pages lock to be held. */ static void page_cache_tree_delete_batch(struct address_space *mapping, @@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, pgoff_t start; start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (i >= pagevec_count(pvec) && !tail_pages) break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page)) continue; if (!tail_pages) { @@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping, } else { tail_pages--; } - radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); - __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); + __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, workingset_lookup_update(mapping)); total_pages++; } @@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_tree_delete_batch(mapping, pvec); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(old, NULL); error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); @@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) 
__inc_node_page_state(new, NR_SHMEM); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) @@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page, page->mapping = mapping; page->index = offset; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) @@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. */ if (!huge) __inc_node_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); @@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page, err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index++; @@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index--; @@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) rcu_read_lock(); repeat: page = NULL; - pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) @@ -1633,7 +1634,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1710,7 +1711,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { struct page *head, *page; if (iter.index > end) @@ -1795,7 +1796,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1875,8 +1876,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, *index, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { struct page *head, *page; if (iter.index > end) @@ -1969,8 +1969,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 
return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, start, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -2624,8 +2623,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start_pgoff) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { if (iter.index > end_pgoff) break; repeat: @@ -160,6 +160,32 @@ static void hmm_invalidate_range(struct hmm *hmm, up_read(&hmm->mirrors_sem); } +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct hmm_mirror *mirror; + struct hmm *hmm = mm->hmm; + + down_write(&hmm->mirrors_sem); + mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, + list); + while (mirror) { + list_del_init(&mirror->list); + if (mirror->ops->release) { + /* + * Drop mirrors_sem so callback can wait on any pending + * work that might itself trigger mmu_notifier callback + * and thus would deadlock with us. + */ + up_write(&hmm->mirrors_sem); + mirror->ops->release(mirror); + down_write(&hmm->mirrors_sem); + } + mirror = list_first_entry_or_null(&hmm->mirrors, + struct hmm_mirror, list); + } + up_write(&hmm->mirrors_sem); +} + static void hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -185,6 +211,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, }; @@ -206,13 +233,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; +again: mirror->hmm = hmm_register(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); + if (mirror->hmm->mm == NULL) { + /* + * A racing hmm_mirror_unregister() is about to destroy the hmm + * struct. Try again to allocate a new one. 
+ */ + up_write(&mirror->hmm->mirrors_sem); + mirror->hmm = NULL; + goto again; + } else { + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + } return 0; } @@ -227,11 +265,32 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = mirror->hmm; + bool should_unregister = false; + struct mm_struct *mm; + struct hmm *hmm; + if (mirror->hmm == NULL) + return; + + hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del(&mirror->list); + list_del_init(&mirror->list); + should_unregister = list_empty(&hmm->mirrors); + mirror->hmm = NULL; + mm = hmm->mm; + hmm->mm = NULL; up_write(&hmm->mirrors_sem); + + if (!should_unregister || mm == NULL) + return; + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -240,110 +299,275 @@ struct hmm_vma_walk { unsigned long last; bool fault; bool block; - bool write; }; -static int hmm_vma_do_fault(struct mm_walk *walk, - unsigned long addr, - hmm_pfn_t *pfn) +static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, + bool write_fault, uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; int r; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + flags |= write_fault ? FAULT_FLAG_WRITE : 0; r = handle_mm_fault(vma, addr, flags); if (r & VM_FAULT_RETRY) return -EBUSY; if (r & VM_FAULT_ERROR) { - *pfn = HMM_PFN_ERROR; + *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } return -EAGAIN; } -static void hmm_pfns_special(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = HMM_PFN_SPECIAL; -} - static int hmm_pfns_bad(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; - hmm_pfn_t *pfns = range->pfns; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = HMM_PFN_ERROR; + pfns[i] = range->values[HMM_PFN_ERROR]; return 0; } -static void hmm_pfns_clear(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = 0; -} - -static int hmm_vma_walk_hole(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +/* + * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @fault: should we fault or not ? + * @write_fault: write fault ? + * @walk: mm_walk structure + * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * + * This function will be called whenever pmd_none() or pte_none() returns true, + * or whenever there is no page directory covering the virtual address range. 
+ */ +static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, + bool fault, bool write_fault, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) { + pfns[i] = range->values[HMM_PFN_NONE]; + if (fault || write_fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, write_fault, + &pfns[i]); if (ret != -EAGAIN) return ret; } } - return hmm_vma_walk->fault ? -EAGAIN : 0; + return (fault || write_fault) ? -EAGAIN : 0; } -static int hmm_vma_walk_clear(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags, + bool *fault, bool *write_fault) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + + *fault = *write_fault = false; + if (!hmm_vma_walk->fault) + return; + + /* We aren't ask to do anything ... */ + if (!(pfns & range->flags[HMM_PFN_VALID])) + return; + /* If this is device memory than only fault if explicitly requested */ + if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + /* Do we fault on device memory ? */ + if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { + *write_fault = pfns & range->flags[HMM_PFN_WRITE]; + *fault = true; + } + return; + } + + /* If CPU page table is not valid then we need to fault */ + *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); + /* Need to write fault ? */ + if ((pfns & range->flags[HMM_PFN_WRITE]) && + !(cpu_flags & range->flags[HMM_PFN_WRITE])) { + *write_fault = true; + *fault = true; + } +} + +static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags, bool *fault, + bool *write_fault) +{ unsigned long i; - hmm_vma_walk->last = addr; + if (!hmm_vma_walk->fault) { + *fault = *write_fault = false; + return; + } + + for (i = 0; i < npages; ++i) { + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, + fault, write_fault); + if ((*fault) || (*write_fault)) + return; + } +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + bool fault, write_fault; + unsigned long i, npages; + uint64_t *pfns; + i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = 0; - if (hmm_vma_walk->fault) { - int ret; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +} - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); - if (ret != -EAGAIN) - return ret; +static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +{ + if (pmd_protnone(pmd)) + return 0; + return pmd_write(pmd) ? 
range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pmd(struct mm_walk *walk, + unsigned long addr, + unsigned long end, + uint64_t *pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long pfn, npages, i; + bool fault, write_fault; + uint64_t cpu_flags; + + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, + &fault, &write_fault); + + if (pmd_protnone(pmd) || fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + hmm_vma_walk->last = end; + return 0; +} + +static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte)) + return 0; + return pte_write(pte) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, + unsigned long end, pmd_t *pmdp, pte_t *ptep, + uint64_t *pfn) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + bool fault, write_fault; + uint64_t cpu_flags; + pte_t pte = *ptep; + uint64_t orig_pfn = *pfn; + + *pfn = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + + if (pte_none(pte)) { + if (fault || write_fault) + goto fault; + return 0; + } + + if (!pte_present(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (!non_swap_entry(entry)) { + if (fault || write_fault) + goto fault; + return 0; } + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. + */ + if (is_device_private_entry(entry)) { + cpu_flags = range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_DEVICE_PRIVATE]; + cpu_flags |= is_write_device_private_entry(entry) ? + range->flags[HMM_PFN_WRITE] : 0; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) + goto fault; + *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn |= cpu_flags; + return 0; + } + + if (is_migration_entry(entry)) { + if (fault || write_fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + return 0; + } + + /* Report error for everything else */ + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } - return hmm_vma_walk->fault ? 
-EAGAIN : 0; + if (fault || write_fault) + goto fault; + + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + return 0; + +fault: + pte_unmap(ptep); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -353,26 +577,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long addr = start, i; - bool write_fault; - hmm_pfn_t flag; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; - flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; - write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: if (pmd_none(*pmdp)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - unsigned long pfn; pmd_t pmd; /* @@ -388,17 +606,8 @@ again: barrier(); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - if (pmd_protnone(pmd)) - return hmm_vma_walk_clear(start, end, walk); - if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_clear(start, end, walk); - - pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; - return 0; + return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } if (pmd_bad(*pmdp)) @@ -406,79 +615,43 @@ again: ptep = pte_offset_map(pmdp, addr); for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { - pte_t pte = *ptep; - - pfns[i] = 0; + int r; - if (pte_none(pte)) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) - goto fault; - continue; + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + if (r) { + /* hmm_vma_handle_pte() did unmap pte directory */ + hmm_vma_walk->last = addr; + return r; } - - if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) - goto fault; - continue; - } - - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. - */ - if (is_device_private_entry(entry)) { - pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - pfns[i] |= HMM_PFN_WRITE; - } else if (write_fault) - goto fault; - pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; - pfns[i] |= flag; - } else if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); - return -EAGAIN; - } - continue; - } else { - /* Report error for everything else */ - pfns[i] = HMM_PFN_ERROR; - } - continue; - } - - if (write_fault && !pte_write(pte)) - goto fault; - - pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; - continue; - -fault: - pte_unmap(ptep); - /* Fault all pages in range */ - return hmm_vma_walk_clear(start, end, walk); } pte_unmap(ptep - 1); + hmm_vma_walk->last = addr; return 0; } +static void hmm_pfns_clear(struct hmm_range *range, + uint64_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = range->values[HMM_PFN_NONE]; +} + +static void hmm_pfns_special(struct hmm_range *range) +{ + unsigned long addr = range->start, i = 0; + + for (; addr < range->end; addr += PAGE_SIZE, i++) + range->pfns[i] = range->values[HMM_PFN_SPECIAL]; +} + /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @vma: virtual memory area containing the virtual address range - * @range: used to track snapshot validity - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @entries: array of hmm_pfn_t: provided by the caller, filled in by function - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * @range: range being snapshotted + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * vma permission, 0 success * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further @@ -491,26 +664,17 @@ fault: * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns) +int hmm_vma_get_pfns(struct hmm_range *range) { + struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return -EINVAL; - } - /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); @@ -520,10 +684,24 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. 
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); @@ -541,14 +719,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; - walk_page_range(start, end, &mm_walk); + walk_page_range(range->start, range->end, &mm_walk); return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @vma: virtual memory area containing the virtual address range * @range: range being tracked * Returns: false if range data has been invalidated, true otherwise * @@ -568,10 +745,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); - * if (!hmm_vma_range_done(vma, range)) { + * if (!hmm_vma_range_done(range)) { * device_page_table_unlock(); * goto again; * } @@ -579,13 +756,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * device_page_table_lock(); - * hmm_vma_range_done(vma, range); - * device_update_page_table(pfns); + * hmm_vma_range_done(range); + * device_update_page_table(range->pfns); * device_page_table_unlock(); */ -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +bool hmm_vma_range_done(struct hmm_range *range) { unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; struct hmm *hmm; @@ -595,7 +772,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) return false; } - hmm = hmm_register(vma->vm_mm); + hmm = hmm_register(range->vma->vm_mm); if (!hmm) { memset(range->pfns, 0, sizeof(*range->pfns) * npages); return false; @@ -611,36 +788,34 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range - * @vma: virtual memory area containing the virtual address range - * @range: use to track pfns array content validity - * @start: fault range virtual start address (inclusive) - * @end: fault range virtual end address (exclusive) - * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted - * @write: is it a write fault + * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) * * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs. * - * On error, for one virtual address in the range, the function will set the - * hmm_pfn_t error flag for the corresponding pfn entry. + * On error, for one virtual address in the range, the function will mark the + * corresponding HMM pfn entry with an error flag. 
* * Expected use pattern: * retry: * down_read(&mm->mmap_sem); * // Find vma and address device wants to fault, initialize hmm_pfn_t * // array accordingly - * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * ret = hmm_vma_fault(range, write, block); * switch (ret) { * case -EAGAIN: - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // You might want to rate limit or yield to play nicely, you may * // also commit any valid pfn in the array assuming that you are * // getting true from hmm_vma_range_monitor_end() * goto retry; * case 0: * break; + * case -ENOMEM: + * case -EINVAL: + * case -EPERM: * default: * // Handle error ! * up_read(&mm->mmap_sem) @@ -648,7 +823,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * } * // Take device driver lock that serialize device page table update * driver_lock_device_page_table_update(); - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // Commit pfns we got from hmm_vma_fault() * driver_unlock_device_page_table_update(); * up_read(&mm->mmap_sem) @@ -658,51 +833,54 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block) +int hmm_vma_fault(struct hmm_range *range, bool block) { + struct vm_area_struct *vma = range->vma; + unsigned long start = range->start; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; int ret; /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); if (!hmm) { - hmm_pfns_clear(pfns, start, end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } /* Caller must have registered a mirror using hmm_mirror_register() */ if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. 
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); spin_unlock(&hmm->lock); - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return 0; - } - hmm_vma_walk.fault = true; - hmm_vma_walk.write = write; hmm_vma_walk.block = block; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; @@ -717,7 +895,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; } while (ret == -EAGAIN); @@ -725,8 +903,9 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long i; i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); - hmm_vma_range_done(vma, range); + hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, + range->end); + hmm_vma_range_done(range); } return ret; } @@ -845,13 +1024,6 @@ static void hmm_devmem_release(struct device *dev, void *data) hmm_devmem_radix_release(resource); } -static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); -} - static int hmm_devmem_pages_create(struct hmm_devmem *devmem) { resource_size_t key, align_start, align_size, align_end; @@ -892,9 +1064,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { struct hmm_devmem *dup; - rcu_read_lock(); - dup = hmm_devmem_find(key); - rcu_read_unlock(); + dup = radix_tree_lookup(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT); if (dup) { dev_err(device, "%s: collides with mapping for %s\n", __func__, dev_name(dup->device)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0ae8d1d4329..14ed6ee5e02f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,8 +555,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, - true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1317,7 +1316,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp | __GFP_NORETRY, &memcg, true))) { + huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) @@ -2402,6 +2401,12 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because some iterators expect new + * pages to show after the currently processed elements - e.g. 
+ * migrate_pages + */ lru_add_page_tail(head, page_tail, lruvec, list); } @@ -2445,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } else { /* Additional pin to radix tree */ page_ref_add(head, 2); - spin_unlock(&head->mapping->tree_lock); + xa_unlock(&head->mapping->i_pages); } spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); @@ -2653,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { void **pslot; - spin_lock(&mapping->tree_lock); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + xa_lock(&mapping->i_pages); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(head)); /* * Check if the head page is present in radix tree. * We assume all tail are present too, if head is there. */ if (radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock) != head) + &mapping->i_pages.xa_lock) != head) goto fail; } @@ -2695,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; diff --git a/mm/internal.h b/mm/internal.h index e6bd35182dae..62d8c34e63d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; @@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node); #endif /* __MM_INTERNAL_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e42568284e06..d7b2a4bf8671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,9 +965,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1326,9 +1324,7 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1348,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm, */ index = start; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { int n = min(iter.index, end) - index; /* @@ -1362,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm, } nr_none += n; for (; index < min(iter.index, 
end); index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } @@ -1371,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm, break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; goto tree_unlocked; } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } else if (trylock_page(page)) { get_page(page); } else { @@ -1389,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm, } /* - * The page must be locked, so we can drop the tree_lock + * The page must be locked, so we can drop the i_pages lock * without racing with truncate. */ VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1400,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1410,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - slot = radix_tree_lookup_slot(&mapping->page_tree, index); + slot = radix_tree_lookup_slot(&mapping->i_pages, index); VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->tree_lock), page); + &mapping->i_pages.xa_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1435,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->page_tree, slot, + radix_tree_replace_slot(&mapping->i_pages, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); out_isolate_failed: unlock_page(page); @@ -1468,14 +1464,14 @@ out_unlock: } for (; index < end; index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } nr_none += n; } tree_locked: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); tree_unlocked: if (result == SCAN_SUCCEED) { @@ -1524,9 +1520,8 @@ tree_unlocked: } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; page = list_first_entry_or_null(&pagelist, @@ -1536,8 +1531,7 @@ tree_unlocked: break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->page_tree, - iter.index); + radix_tree_delete(&mapping->i_pages, iter.index); continue; } @@ -1546,16 +1540,15 @@ tree_unlocked: /* Unfreeze the page. 
*/ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->page_tree, - slot, page); + radix_tree_replace_slot(&mapping->i_pages, slot, page); slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); unlock_page(page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } VM_BUG_ON(nr_none); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1583,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= start + HPAGE_PMD_NR) break; @@ -1883,8 +1876,16 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - for_each_populated_zone(zone) + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages. + */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + nr_zones++; + } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; @@ -1131,6 +1131,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)); + /* + * We're replacing an anonymous page with a zero page, which is + * not anonymous. We need to do proper accounting otherwise we + * will get wrong values in /proc, and a BUG message in dmesg + * when tearing down the mm. 
+ */ + dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9ec024b862ac..e074f7c637aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) return; /* * We are in the middle of the charge context here, so we @@ -1839,7 +1839,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) } } - for (i = 0; i < MEMCG_NR_EVENTS; i++) { + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); @@ -1858,7 +1858,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_event(memcg, MEMCG_HIGH); + memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1949,7 +1949,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_event(mem_over_limit, MEMCG_MAX); + memcg_memory_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1992,7 +1992,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_event(mem_over_limit, MEMCG_OOM); + memcg_memory_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2688,10 +2688,10 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) struct mem_cgroup *iter; int i; - memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); + memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_EVENTS; i++) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) events[i] += memcg_sum_events(iter, i); } } @@ -4108,6 +4108,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + return; + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -5178,7 +5181,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_event(memcg, MEMCG_OOM); + memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5191,10 +5194,14 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; @@ -5204,7 +5211,7 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long 
events[NR_VM_EVENT_ITEMS]; int i; /* @@ -5967,9 +5974,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) /* * Interrupts should be disabled here because the caller holds the - * mapping->tree_lock lock which is taken with interrupts-off. It is + * i_pages lock which is taken with interrupts-off. It is * important here to have the interrupts disabled because it is the - * only synchronisation we have for udpating the per-CPU variables. + * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2d4bf647cf01..9d142b9b86dc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1487,7 +1487,7 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private, int **x) +static struct page *new_page(struct page *p, unsigned long private) { int nid = page_to_nid(p); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cc6dfa5832ca..f74826cdceea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1329,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } -static struct page *new_node_page(struct page *page, unsigned long private, - int **result) +static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; @@ -1373,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } else if (thp_migration_supported() && PageTransHuge(page)) + } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01cbb7078d6c..9ac49ef17b4e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, __split_huge_pmd(walk->vma, pmd, addr, false, NULL); goto out; } - if (!thp_migration_supported()) { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - goto out; - } if (!queue_pages_required(page, qp)) { ret = 1; goto unlock; @@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; -retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -511,22 +502,6 @@ retry: continue; if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page) && !thp_migration_supported()) { - get_page(page); - pte_unmap_unlock(pte, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - /* Failed to split -- skip. 
*/ - if (ret) { - pte = pte_offset_map_lock(walk->mm, pmd, - addr, &ptl); - continue; - } - goto retry; - } - migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); @@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static struct page *new_node_page(struct page *page, unsigned long node, int **x) +/* page allocation callback for NUMA node migration */ +struct page *alloc_new_node_page(struct page *page, unsigned long node) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else if (thp_migration_supported() && PageTransHuge(page)) { + else if (PageTransHuge(page)) { struct page *thp; thp = alloc_pages_node(node, @@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, NULL, dest, + err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long uninitialized_var(address); @@ -1123,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); - } else if (thp_migration_supported() && PageTransHuge(page)) { + } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, @@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 003886606a22..f65dd69e1fd1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, + &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -522,7 +523,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); 
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -531,7 +532,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - 1); - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -574,20 +575,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -596,11 +596,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return MIGRATEPAGE_SUCCESS; } @@ -1137,10 +1137,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - int *result = NULL; struct page *newpage; - newpage = get_new_page(page, private, &result); + if (!thp_migration_supported() && PageTransHuge(page)) + return -ENOMEM; + + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1161,14 +1163,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { - lock_page(page); - rc = split_huge_page(page); - unlock_page(page); - if (rc) - goto out; - } - rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1231,12 +1225,6 @@ put_new: put_page(newpage); } - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(newpage); - } return rc; } @@ -1264,7 +1252,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason) { int rc = -EAGAIN; - int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1281,7 +1268,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } - new_hpage = get_new_page(hpage, private, &result); + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; @@ -1345,12 +1332,6 @@ out: else putback_active_hugepage(new_hpage); - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(new_hpage); - } return rc; } @@ -1395,6 +1376,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, retry = 0; list_for_each_entry_safe(page, page2, from, lru) { +retry: cond_resched(); if (PageHuge(page)) @@ -1408,6 +1390,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + /* + * THP migration might be unsupported or the + * allocation could've 
failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + if (PageTransHuge(page)) { + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) { + list_safe_reset_next(page, page2, lru); + goto retry; + } + } nr_failed++; goto out; case -EAGAIN: @@ -1444,141 +1446,101 @@ out: } #ifdef CONFIG_NUMA -/* - * Move a list of individual pages - */ -struct page_to_node { - unsigned long addr; - struct page *page; - int node; - int status; -}; -static struct page *new_page_node(struct page *p, unsigned long private, - int **result) +static int store_status(int __user *status, int start, int value, int nr) { - struct page_to_node *pm = (struct page_to_node *)private; - - while (pm->node != MAX_NUMNODES && pm->page != p) - pm++; + while (nr-- > 0) { + if (put_user(value, status + start)) + return -EFAULT; + start++; + } - if (pm->node == MAX_NUMNODES) - return NULL; + return 0; +} - *result = &pm->status; +static int do_move_pages_to_node(struct mm_struct *mm, + struct list_head *pagelist, int node) +{ + int err; - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - pm->node); - else if (thp_migration_supported() && PageTransHuge(p)) { - struct page *thp; + if (list_empty(pagelist)) + return 0; - thp = alloc_pages_node(pm->node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(pm->node, - GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(pagelist); + return err; } /* - * Move a set of pages as indicated in the pm array. The addr - * field must be set to the virtual address of the page to be moved - * and the node number must contain a valid target node. - * The pm array ends with node = MAX_NUMNODES. + * Resolves the given address to a struct page, isolates it from the LRU and + * puts it to the given pagelist. 
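The new store_status()/do_move_pages_to_node() helpers batch move_pages(2) work per target node instead of filling a fixed page_to_node array, but the user-visible contract is unchanged: each status[i] receives the page's new node id or a negative errno. A minimal caller, sketched with libnuma's <numaif.h> declarations and assuming a NUMA-enabled kernel in which node 0 exists (build with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        unsigned long count = 4;
        void *pages[4];
        int nodes[4], status[4];
        long rc;
        char *buf = aligned_alloc(psz, count * psz);

        if (!buf)
                return 1;
        memset(buf, 0, count * psz);            /* fault the pages in first */
        for (unsigned long i = 0; i < count; i++) {
                pages[i] = buf + i * psz;
                nodes[i] = 0;                   /* request node 0 for every page */
        }

        /* pid 0 == this process; on return status[i] is the node or -errno */
        rc = move_pages(0, count, pages, nodes, status, MPOL_MF_MOVE);
        if (rc < 0)
                perror("move_pages");
        for (unsigned long i = 0; i < count; i++)
                printf("page %lu -> %d\n", i, status[i]);
        free(buf);
        return 0;
}

Per-page lookup or isolation failures are not fatal: they are simply reported through status[] while the rest of the batch is still migrated, which is exactly what add_page_for_migration() and store_status() implement in the hunks around this point.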
+ * Returns -errno if the page cannot be found/isolated or 0 when it has been + * queued or the page doesn't need to be migrated because it is already on + * the target node */ -static int do_move_page_to_node_array(struct mm_struct *mm, - struct page_to_node *pm, - int migrate_all) +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, + int node, struct list_head *pagelist, bool migrate_all) { + struct vm_area_struct *vma; + struct page *page; + unsigned int follflags; int err; - struct page_to_node *pp; - LIST_HEAD(pagelist); down_read(&mm->mmap_sem); + err = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + goto out; - /* - * Build a list of pages to migrate - */ - for (pp = pm; pp->node != MAX_NUMNODES; pp++) { - struct vm_area_struct *vma; - struct page *page; - struct page *head; - unsigned int follflags; - - err = -EFAULT; - vma = find_vma(mm, pp->addr); - if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) - goto set_status; - - /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; - page = follow_page(vma, pp->addr, follflags); + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; - err = -ENOENT; - if (!page) - goto set_status; + err = -ENOENT; + if (!page) + goto out; - err = page_to_nid(page); + err = 0; + if (page_to_nid(page) == node) + goto out_putpage; - if (err == pp->node) - /* - * Node already in the right place - */ - goto put_and_set; + err = -EACCES; + if (page_mapcount(page) > 1 && !migrate_all) + goto out_putpage; - err = -EACCES; - if (page_mapcount(page) > 1 && - !migrate_all) - goto put_and_set; - - if (PageHuge(page)) { - if (PageHead(page)) { - isolate_huge_page(page, &pagelist); - err = 0; - pp->page = page; - } - goto put_and_set; + if (PageHuge(page)) { + if (PageHead(page)) { + isolate_huge_page(page, pagelist); + err = 0; } + } else { + struct page *head; - pp->page = compound_head(page); head = compound_head(page); err = isolate_lru_page(head); - if (!err) { - list_add_tail(&head->lru, &pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); - } -put_and_set: - /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. - */ - put_page(page); -set_status: - pp->status = err; - } - - err = 0; - if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, NULL, - (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_movable_pages(&pagelist); - } + goto out_putpage; + err = 0; + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); + } +out_putpage: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. 
+ */ + put_page(page); +out: up_read(&mm->mmap_sem); return err; } @@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm; - unsigned long chunk_nr_pages; - unsigned long chunk_start; - int err; - - err = -ENOMEM; - pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); - if (!pm) - goto out; + int current_node = NUMA_NO_NODE; + LIST_HEAD(pagelist); + int start, i; + int err = 0, err1; migrate_prep(); - /* - * Store a chunk of page_to_node array in a page, - * but keep the last one as a marker - */ - chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - - for (chunk_start = 0; - chunk_start < nr_pages; - chunk_start += chunk_nr_pages) { - int j; + for (i = start = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; + int node; - if (chunk_start + chunk_nr_pages > nr_pages) - chunk_nr_pages = nr_pages - chunk_start; - - /* fill the chunk pm with addrs and nodes from user-space */ - for (j = 0; j < chunk_nr_pages; j++) { - const void __user *p; - int node; - - err = -EFAULT; - if (get_user(p, pages + j + chunk_start)) - goto out_pm; - pm[j].addr = (unsigned long) p; - - if (get_user(node, nodes + j + chunk_start)) - goto out_pm; - - err = -ENODEV; - if (node < 0 || node >= MAX_NUMNODES) - goto out_pm; - - if (!node_state(node, N_MEMORY)) - goto out_pm; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out_pm; + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_flush; + if (get_user(node, nodes + i)) + goto out_flush; + addr = (unsigned long)p; + + err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_flush; + if (!node_state(node, N_MEMORY)) + goto out_flush; - pm[j].node = node; + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_flush; + + if (current_node == NUMA_NO_NODE) { + current_node = node; + start = i; + } else if (node != current_node) { + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + start = i; + current_node = node; } - /* End marker for this chunk */ - pm[chunk_nr_pages].node = MAX_NUMNODES; - - /* Migrate this chunk */ - err = do_move_page_to_node_array(mm, pm, - flags & MPOL_MF_MOVE_ALL); - if (err < 0) - goto out_pm; + /* + * Errors in the page lookup or isolation are not fatal and we simply + * report them via status + */ + err = add_page_for_migration(mm, addr, current_node, + &pagelist, flags & MPOL_MF_MOVE_ALL); + if (!err) + continue; - /* Return status information */ - for (j = 0; j < chunk_nr_pages; j++) - if (put_user(pm[j].status, status + j + chunk_start)) { - err = -EFAULT; - goto out_pm; - } - } - err = 0; + err = store_status(status, i, err, 1); + if (err) + goto out_flush; -out_pm: - free_page((unsigned long)pm); + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + if (i > start) { + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + } + current_node = NUMA_NO_NODE; + } +out_flush: + /* Make sure we do not overwrite the existing error */ + err1 = do_move_pages_to_node(mm, &pagelist, current_node); + if (!err1) + err1 = store_status(status, start, current_node, i - start); + if (!err) + err = err1; out: return err; } @@ -1866,8 +1828,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, } static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long 
data, - int **result) + unsigned long data) { int nid = (int) data; struct page *newpage; @@ -1987,6 +1948,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; /* + * Also do not migrate dirty pages as not all filesystems can move + * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. + */ + if (page_is_file_cache(page) && PageDirty(page)) + goto out; + + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! @@ -2339,7 +2307,8 @@ again: ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, pte_write(pte)); + entry = make_migration_entry(page, mpfn & + MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); diff --git a/mm/mmap.c b/mm/mmap.c index f2154fc2548b..188f195883b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (flags & MAP_FIXED_NOREPLACE) { + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && vma->vm_start <= addr) + return -EEXIST; + } + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) diff --git a/mm/mprotect.c b/mm/mprotect.c index c1d6af7455da..625608bc8962 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -27,6 +27,7 @@ #include <linux/pkeys.h> #include <linux/ksm.h> #include <linux/uaccess.h> +#include <linux/mm_inline.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> @@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page_mapcount(page) != 1) continue; + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (page_is_file_cache(page) && PageDirty(page)) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 586f31261c83..5c1a3279e63f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2099,7 +2099,8 @@ void __init page_writeback_init(void) * so that it can tag pages faster than a dirtying process can create them). */ /* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock + * latency. 
*/ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping, struct radix_tree_iter iter; void **slot; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, PAGECACHE_TAG_DIRTY) { if (iter.index > end) break; - radix_tree_iter_tag_set(&mapping->page_tree, &iter, + radix_tree_iter_tag_set(&mapping->i_pages, &iter, PAGECACHE_TAG_TOWRITE); tagged++; if ((tagged % WRITEBACK_TAG_BATCH) != 0) continue; slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); cond_resched(); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page) return 1; } - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); if (mapping->host) { @@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page) PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestClearPageWriteback(page); } @@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), 
PAGECACHE_TAG_TOWRITE); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestSetPageWriteback(page); } @@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - return radix_tree_tagged(&mapping->page_tree, tag); + return radix_tree_tagged(&mapping->i_pages, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b97b8ece4a9..905db9d7962f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -46,7 +46,6 @@ #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> -#include <xen/xen.h> #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> @@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -316,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Xen PV domains need page structures early */ - if (xen_pv_domain()) - return true; (*nr_initialised)++; if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { @@ -1746,16 +1743,38 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. + */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -2870,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3146,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. 
If these * are not met, then a high-order request also cannot go ahead @@ -3178,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3201,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3216,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3852,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -4322,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } @@ -4734,6 +4731,13 @@ long si_mem_available(void) min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + /* + * Part of the kernel memory, which can be released under memory + * pressure. + */ + available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> + PAGE_SHIFT; + if (available < 0) available = 0; return available; @@ -6200,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6227,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6278,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. 
+ */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -7125,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += lower_zone->managed_pages; } } @@ -7922,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 61dee77bb211..43e085608846 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -309,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ? 
-EBUSY : 0; } -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp) +struct page *alloc_migrate_target(struct page *page, unsigned long private) { return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/readahead.c b/mm/readahead.c index 4d57b4644f98..539bbb6c1fad 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_offset); + page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) continue; diff --git a/mm/rmap.c b/mm/rmap.c index 9122787c4947..f0dd4e4565bc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) + * i_pages lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, + * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * diff --git a/mm/shmem.c b/mm/shmem.c index 4424fc0c33aa..9d6c7e595415 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping, VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); if (!item) return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, + __radix_tree_replace(&mapping->i_pages, node, pslot, replacement, NULL); return 0; } @@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, void *item; rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); + item = radix_tree_lookup(&mapping->i_pages, index); rcu_read_unlock(); return item == swp_to_radix_entry(swap); } @@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (PageTransHuge(page)) { void __rcu **results; pgoff_t idx; int i; error = 0; - if (radix_tree_gang_lookup_slot(&mapping->page_tree, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, index, 1) && idx < index + HPAGE_PMD_NR) { error = -EEXIST; @@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page, if (!error) { for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->page_tree, + error = radix_tree_insert(&mapping->i_pages, index + i, page + i); VM_BUG_ON(error); } count_vm_event(THP_FILE_ALLOC); } } else if (!expected) { - error = radix_tree_insert(&mapping->page_tree, index, page); + error = radix_tree_insert(&mapping->i_pages, index, page); } else { error = shmem_radix_tree_replace(mapping, index, expected, page); @@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - 
spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); } return error; @@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); } @@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping, { void *old; - spin_lock_irq(&mapping->tree_lock); - old = radix_tree_delete_item(&mapping->page_tree, index, radswap); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; @@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->i_pages, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1448,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, hindex = round_down(index, HPAGE_PMD_NR); rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, hindex, 1) && idx < hindex + HPAGE_PMD_NR) { rcu_read_unlock(); return NULL; @@ -1561,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. 
*/ - spin_lock_irq(&swap_mapping->tree_lock); + xa_lock_irq(&swap_mapping->i_pages); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); } - spin_unlock_irq(&swap_mapping->tree_lock); + xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* @@ -2634,7 +2634,7 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -2642,10 +2642,10 @@ static void shmem_tag_pins(struct address_space *mapping) continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_set(&mapping->page_tree, iter.index, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } if (need_resched()) { @@ -2677,7 +2677,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) break; if (!scan) @@ -2687,7 +2687,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); @@ -2713,10 +2713,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = -EBUSY; } - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); diff --git a/mm/slub.c b/mm/slub.c index 4fb037c98782..44aa7847324a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1363,10 +1363,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x, _RET_IP_); } -static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) { - void *freeptr; - kmemleak_free_recursive(x, s->flags); /* @@ -1386,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - freeptr = get_freepointer(s, x); - /* - * kasan_slab_free() may put x into memory quarantine, delaying its - * reuse. In this case the object's freelist pointer is changed. 
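The slab_free_freelist_hook() rework below walks the freelist once and relinks only the objects whose reuse does not have to be delayed by the KASAN quarantine. Stripped of the SLUB specifics, the same filter-and-relink pattern over a singly linked freelist can be sketched as follows (standalone illustrative C with made-up names, not kernel code):

#include <stdbool.h>
#include <stddef.h>

struct obj {
        struct obj *next;
        bool quarantined;       /* stand-in for kasan_slab_free() delaying reuse */
};

/* Rebuild a freelist, dropping objects whose reuse must be delayed. */
struct obj *filter_freelist(struct obj *old_head, struct obj **tail)
{
        struct obj *head = NULL;
        struct obj *next = old_head;

        *tail = NULL;
        while (next) {
                struct obj *obj = next;

                next = obj->next;
                if (obj->quarantined)
                        continue;       /* left out, as the quarantine does */
                obj->next = head;       /* relink onto the rebuilt list */
                head = obj;
                if (!*tail)
                        *tail = obj;
        }
        return head;                    /* NULL: nothing is left to free now */
}

Tracking both head and tail of the rebuilt list is what lets slab_free() hand a possibly shortened freelist straight to do_slab_free(), or skip the call entirely when every object was put into quarantine.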
- */ - kasan_slab_free(s, x, _RET_IP_); - return freeptr; + /* KASAN might put x into memory quarantine, delaying its reuse */ + return kasan_slab_free(s, x, _RET_IP_); } -static inline void slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail) { /* * Compiler cannot detect this function can be removed if slab_free_hook() @@ -1407,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; - void *freeptr; + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; do { - freeptr = slab_free_hook(s, object); - } while ((object != tail_obj) && (object = freeptr)); + object = next; + next = get_freepointer(s, object); + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; +#else + return true; #endif } @@ -2968,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { - slab_free_freelist_hook(s, head, tail); /* - * slab_free_freelist_hook() could have put the items into quarantine. - * If so, no need to free them. + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - return; - do_slab_free(s, page, head, tail, cnt, addr); + if (slab_free_freelist_hook(s, &head, &tail)) + do_slab_free(s, page, head, tail, cnt, addr); } #ifdef CONFIG_KASAN diff --git a/mm/swap_state.c b/mm/swap_state.c index f233dccd3b1b..07f9aa2340c3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); for (i = 0; i < nr; i++) { set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->page_tree, + error = radix_tree_insert(&address_space->i_pages, idx + i, page + i); if (unlikely(error)) break; @@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page + i, 0UL); while (i--) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0UL); } ClearPageSwapCache(page); page_ref_sub(page, nr); } - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); return error; } @@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page) address_space = swap_address_space(entry); idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0); } ClearPageSwapCache(page); @@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - 
spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(page); - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); @@ -628,12 +628,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); - spin_lock_init(&space->tree_lock); } nr_swapper_spaces[type] = nr; rcu_assign_pointer(swapper_spaces[type], spaces); diff --git a/mm/swapfile.c b/mm/swapfile.c index c7a33717d079..cc2cf04d9018 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -struct plist_head *swap_avail_heads; +static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -2961,6 +2961,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; + if (!last_page) { + pr_warn("Empty swap-file\n"); + return 0; + } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), diff --git a/mm/truncate.c b/mm/truncate.c index c34e2fd4f583..1d2fb2dca96f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping, struct radix_tree_node *node; void **slot; - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) return; if (*slot != entry) return; - __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + __radix_tree_replace(&mapping->i_pages, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; } @@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } /* @@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, dax = dax_mapping(mapping); lock = !dax && indices[j] < end; if (lock) - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, } if (lock) - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping) * modification that does not see AS_EXITING is * completed before starting the final truncate. 
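The empty lock/unlock pair that follows in truncate_inode_pages_final() is a lock-as-barrier idiom: taking the i_pages lock once after AS_EXITING is set guarantees that every path which sampled the flag before it was set has finished its critical section. The same idiom in plain pthreads, as a self-contained sketch with invented names (compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool exiting;
static int nr_entries;

/* Writer side: only touches the shared state if the flag is not yet set. */
static void *writer(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++) {
                pthread_mutex_lock(&lock);
                if (!atomic_load(&exiting))
                        nr_entries++;
                pthread_mutex_unlock(&lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, writer, NULL);

        atomic_store(&exiting, true);   /* like mapping_set_exiting() */
        /*
         * Empty critical section: once we own the lock, every writer that
         * missed the flag has already dropped it, so nr_entries is final.
         */
        pthread_mutex_lock(&lock);
        pthread_mutex_unlock(&lock);

        printf("final count: %d\n", nr_entries);        /* safe to read now */
        pthread_join(t, NULL);
        return 0;
}

Here the empty critical section plays the role of the xa_lock_irq()/xa_unlock_irq() pair in the hunk below: after it, no new entries can appear and the final truncate can proceed.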
*/ - spin_lock_irq(&mapping->tree_lock); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); truncate_inode_pages(mapping, 0); } @@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } diff --git a/mm/util.c b/mm/util.c index 029fc2f3b395..1fc4fa7576f7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; @@ -668,6 +668,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* + * Part of the kernel memory, which can be released + * under memory pressure. + */ + free += global_node_page_state( + NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + + /* * Leave reserved pages. The pages are not for anonymous pages. 
*/ if (free <= totalreserve_pages) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4390a8d5be41..8b920ce3ae02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -116,6 +116,16 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; }; #ifdef ARCH_HAS_PREFETCH @@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } + +static void set_memcg_congestion(pg_data_t *pgdat, + struct mem_cgroup *memcg, + bool congested) +{ + struct mem_cgroup_per_node *mn; + + if (!memcg) + return; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + WRITE_ONCE(mn->congested, congested); +} + +static bool memcg_congested(pg_data_t *pgdat, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *mn; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + return READ_ONCE(mn->congested); + +} #else static bool global_reclaim(struct scan_control *sc) { @@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc) { return true; } + +static inline void set_memcg_congestion(struct pglist_data *pgdat, + struct mem_cgroup *memcg, bool congested) +{ +} + +static inline bool memcg_congested(struct pglist_data *pgdat, + struct mem_cgroup *memcg) +{ + return false; + +} #endif /* @@ -648,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); /* * The non racy check for a busy page. * @@ -672,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, - * and thus under tree_lock, then this ordering is not required. + * and thus under the i_pages lock, then this ordering is not required. */ if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) refcount = 1 + HPAGE_PMD_NR; @@ -690,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -711,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the - * same page_tree. + * same address_space. 
*/ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (freepage != NULL) freepage(page); @@ -726,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } @@ -857,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } -struct reclaim_stat { - unsigned nr_dirty; - unsigned nr_unqueued_dirty; - unsigned nr_congested; - unsigned nr_writeback; - unsigned nr_immediate; - unsigned nr_activate; - unsigned nr_ref_keep; - unsigned nr_unmap_fail; -}; - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -926,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* - * The number of dirty pages determines if a zone is marked + * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. @@ -1755,23 +1789,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, free_unref_page_list(&page_list); /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. - * - * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number - * of pages under pages flagged for immediate reclaim and stall if any - * are encountered in the nr_immediate check below. - */ - if (stat.nr_writeback && stat.nr_writeback == nr_taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); - - /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty pages to the end of @@ -1785,48 +1802,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling here. - */ - if (sane_reclaim(sc)) { - /* - * Tag a zone as congested if all the dirty pages scanned were - * backed by a congested BDI and wait_iff_congested will stall. - */ - if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - - /* Allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it implies - * that pages are cycling through the LRU faster than - * they are written so also forcibly stall. 
- */ - if (stat.nr_immediate && current_may_throttle()) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - - /* - * Stall direct reclaim for IO completions if underlying BDIs or zone - * is congested. Allow kswapd to continue until it starts encountering - * unqueued dirty pages or cycling through the LRU too quickly. - */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle()) - wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - nr_scanned, nr_reclaimed, - stat.nr_dirty, stat.nr_writeback, - stat.nr_congested, stat.nr_immediate, - stat.nr_activate, stat.nr_ref_keep, - stat.nr_unmap_fail, - sc->priority, file); + nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } @@ -2507,6 +2493,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return true; } +static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) +{ + return test_bit(PGDAT_CONGESTED, &pgdat->flags) || + (memcg && memcg_congested(pgdat, memcg)); +} + static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2522,6 +2514,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; @@ -2536,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->memcg_low_skipped = 1; continue; } - mem_cgroup_event(memcg, MEMCG_LOW); + memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2587,6 +2581,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. 
+ */ + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (!global_reclaim(sc) && sane_reclaim(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); @@ -2802,6 +2857,7 @@ retry: continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); } delayacct_freepages_end(); @@ -3808,7 +3864,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* - * Free memory by calling shrink zone with increasing + * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { diff --git a/mm/vmstat.c b/mm/vmstat.c index 33581be705f0..536332e988b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_indirectly_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c index b7d616a3bbbe..40ee02c83978 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->page_tree in place + * Returns a shadow entry to be stored in @mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ void *workingset_eviction(struct address_space *mapping, struct page *page) @@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node) * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe - * as node->private_list is protected by &mapping->tree_lock. + * as node->private_list is protected by the i_pages lock. 
*/ if (node->count && node->count == node->exceptional) { if (list_empty(&node->private_list)) @@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); @@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchroneously maintain - * the shadow node LRU under the mapping->tree_lock and the + * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has radix tree nodes on the LRU. * - * We can then safely transition to the mapping->tree_lock to + * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, page_tree); + mapping = container_of(node->root, struct address_space, i_pages); /* Coming from the list, invert the lock order */ - if (!spin_trylock(&mapping->tree_lock)) { + if (!xa_trylock(&mapping->i_pages)) { spin_unlock(lru_lock); ret = LRU_RETRY; goto out; @@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node, + __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); out_invalid: - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: local_irq_enable(); @@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, { unsigned long ret; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); @@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = { /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe - * mapping->tree_lock. + * i_pages lock. 
*/ static struct lock_class_key shadow_nodes_key; diff --git a/mm/z3fold.c b/mm/z3fold.c index f579ad4a8100..c0bca6153b95 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -467,6 +467,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + if (!pool->unbuddied) + goto out_pool; for_each_possible_cpu(cpu) { struct list_head *unbuddied = per_cpu_ptr(pool->unbuddied, cpu); @@ -479,7 +481,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->name = name; pool->compact_wq = create_singlethread_workqueue(pool->name); if (!pool->compact_wq) - goto out; + goto out_unbuddied; pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; @@ -489,8 +491,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, out_wq: destroy_workqueue(pool->compact_wq); -out: +out_unbuddied: + free_percpu(pool->unbuddied); +out_pool: kfree(pool); +out: return NULL; } @@ -533,7 +538,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; - bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; + bool can_sleep = gfpflags_allow_blocking(gfp); if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL;
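
The mm/vmscan.c portion of this merge can be summarised with a small userspace sketch (plain C, not kernel code). The field names mirror the patch, but the node_congested flag, the printed actions and main() are illustrative stand-ins for the PGDAT_* bits, congestion_wait()/wait_iff_congested() and the real reclaim loop; the additional conditions in the patch (current_may_throttle(), hibernation_mode, the per-memcg congestion flag) are omitted here:

#include <stdbool.h>
#include <stdio.h>

struct reclaim_stat {
	unsigned dirty, congested, unqueued_dirty, writeback, immediate;
};

struct scan_control {
	struct {
		unsigned dirty, congested, unqueued_dirty, writeback,
			 immediate, taken, file_taken;
	} nr;
};

static bool node_congested;	/* stand-in for PGDAT_CONGESTED / mn->congested */

/* One LRU pass: only fold the per-pass stats into the per-node totals. */
static void shrink_inactive_list(struct scan_control *sc,
				 const struct reclaim_stat *stat,
				 unsigned nr_taken, bool file)
{
	sc->nr.dirty          += stat->dirty;
	sc->nr.congested      += stat->congested;
	sc->nr.unqueued_dirty += stat->unqueued_dirty;
	sc->nr.writeback      += stat->writeback;
	sc->nr.immediate      += stat->immediate;
	sc->nr.taken          += nr_taken;
	if (file)
		sc->nr.file_taken += nr_taken;
}

/* Per-node decision, loosely mirroring what the patch moves into shrink_node(). */
static void shrink_node(struct scan_control *sc, bool is_kswapd)
{
	if (is_kswapd) {
		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
			puts("kswapd: set PGDAT_WRITEBACK");
		if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
			node_congested = true;	/* PGDAT_CONGESTED */
		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
			puts("kswapd: set PGDAT_DIRTY, may start writing pages");
		if (sc->nr.immediate)
			puts("kswapd: congestion_wait()");
	} else if (node_congested) {
		puts("direct reclaim: wait_iff_congested()");
	}
}

int main(void)
{
	struct scan_control sc = { { 0 } };
	/* every dirty page scanned was backed by a congested BDI */
	struct reclaim_stat pass = { .dirty = 32, .congested = 32 };

	shrink_inactive_list(&sc, &pass, 32, true);
	shrink_node(&sc, true);		/* kswapd marks the node congested */
	shrink_node(&sc, false);	/* a later direct reclaimer throttles */
	return 0;
}

The point of the restructuring is visible in the sketch: shrink_inactive_list() no longer stalls on the partial view of a single LRU pass. The counters are aggregated per node and the throttling decision is taken once in shrink_node(), so kswapd only sleeps when the node-wide totals justify it, and direct reclaimers only wait when the node (or, for cgroup reclaim, the memcg) has actually been marked congested. The mm/z3fold.c hunk at the end is a separate cleanup worth a note: the open-coded test (gfp & __GFP_RECLAIM) == __GFP_RECLAIM required both the direct-reclaim and kswapd-reclaim bits to be set, while gfpflags_allow_blocking() keys off __GFP_DIRECT_RECLAIM alone, which is the bit that actually decides whether the caller may sleep.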