Diffstat (limited to 'mm')
39 files changed, 1018 insertions, 2362 deletions
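The two themes of this diff are the removal of the bootmem allocator in favour of memblock and the conversion of the page cache from the radix tree API to the XArray API. As a rough orientation before the hunks below, here is a minimal sketch of the xas_lock()/xas_store()/xas_nomem() store pattern that the converted filemap code follows; the function name and the simplified error handling are illustrative only and are not part of the patch.

#include <linux/xarray.h>
#include <linux/pagemap.h>

/*
 * Illustrative sketch only: store @page at @index in @mapping, retrying
 * the operation whenever the XArray needs to allocate a node.  This
 * mirrors the loop used by the converted __add_to_page_cache_locked()
 * in the mm/filemap.c hunks below; the name sketch_store_page() and the
 * locking context are simplified for exposition.
 */
static int sketch_store_page(struct address_space *mapping,
			     struct page *page, pgoff_t index, gfp_t gfp)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *old;

	do {
		xas_lock_irq(&xas);
		old = xas_load(&xas);
		if (old && !xa_is_value(old))
			xas_set_err(&xas, -EEXIST);	/* a real page is already present */
		else
			xas_store(&xas, page);		/* shadow (value) entries are replaced */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));			/* allocate nodes and retry if needed */

	return xas_error(&xas);
}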
diff --git a/mm/Kconfig b/mm/Kconfig index de64ea658716..d85e39da47ae 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -127,9 +127,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK - bool - config HAVE_MEMBLOCK_NODE_MAP bool @@ -142,9 +139,6 @@ config HAVE_GENERIC_GUP config ARCH_DISCARD_MEMBLOCK bool -config NO_BOOTMEM - bool - config MEMORY_ISOLATION bool @@ -379,7 +373,7 @@ config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -481,7 +475,7 @@ config FRONTSWAP config CMA bool "Contiguous Memory Allocator" - depends on HAVE_MEMBLOCK && MMU + depends on MMU select MIGRATION select MEMORY_ISOLATION help @@ -634,7 +628,6 @@ config MAX_STACK_SIZE_MB config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on NO_BOOTMEM depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT @@ -671,7 +664,7 @@ config ZONE_DEVICE depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Device memory hotplug support allows for establishing pmem, diff --git a/mm/Makefile b/mm/Makefile index 6485d5745dd7..d210cc9d6f80 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -42,17 +42,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ debug.o $(mmu-y) obj-y += init-mm.o - -ifdef CONFIG_NO_BOOTMEM - obj-y += nobootmem.o -else - obj-y += bootmem.o -endif +obj-y += memblock.o ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o endif -obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o obj-$(CONFIG_FRONTSWAP) += frontswap.o diff --git a/mm/bootmem.c b/mm/bootmem.c deleted file mode 100644 index 97db0e8e362b..000000000000 --- a/mm/bootmem.c +++ /dev/null @@ -1,811 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bootmem - A boot-time physical memory allocator and configurator - * - * Copyright (C) 1999 Ingo Molnar - * 1999 Kanoj Sarcar, SGI - * 2008 Johannes Weiner - * - * Access to this subsystem has to be serialized externally (which is true - * for the boot process anyway). - */ -#include <linux/init.h> -#include <linux/pfn.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/kmemleak.h> -#include <linux/range.h> -#include <linux/bug.h> -#include <linux/io.h> -#include <linux/bootmem.h> - -#include "internal.h" - -/** - * DOC: bootmem overview - * - * Bootmem is a boot-time physical memory allocator and configurator. - * - * It is used early in the boot process before the page allocator is - * set up. - * - * Bootmem is based on the most basic of allocators, a First Fit - * allocator which uses a bitmap to represent memory. If a bit is 1, - * the page is allocated and 0 if unallocated. To satisfy allocations - * of sizes smaller than a page, the allocator records the Page Frame - * Number (PFN) of the last allocation and the offset the allocation - * ended at. Subsequent small allocations are merged together and - * stored on the same page. - * - * The information used by the bootmem allocator is represented by - * :c:type:`struct bootmem_data`. 
An array to hold up to %MAX_NUMNODES - * such structures is statically allocated and then it is discarded - * when the system initialization completes. Each entry in this array - * corresponds to a node with memory. For UMA systems only entry 0 is - * used. - * - * The bootmem allocator is initialized during early architecture - * specific setup. Each architecture is required to supply a - * :c:func:`setup_arch` function which, among other tasks, is - * responsible for acquiring the necessary parameters to initialise - * the boot memory allocator. These parameters define limits of usable - * physical memory: - * - * * @min_low_pfn - the lowest PFN that is available in the system - * * @max_low_pfn - the highest PFN that may be addressed by low - * memory (%ZONE_NORMAL) - * * @max_pfn - the last PFN available to the system. - * - * After those limits are determined, the :c:func:`init_bootmem` or - * :c:func:`init_bootmem_node` function should be called to initialize - * the bootmem allocator. The UMA case should use the `init_bootmem` - * function. It will initialize ``contig_page_data`` structure that - * represents the only memory node in the system. In the NUMA case the - * `init_bootmem_node` function should be called to initialize the - * bootmem allocator for each node. - * - * Once the allocator is set up, it is possible to use either single - * node or NUMA variant of the allocation APIs. - */ - -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { - .bdata = &bootmem_node_data[0] -}; -EXPORT_SYMBOL(contig_page_data); -#endif - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; -unsigned long long max_possible_pfn; - -bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; - -static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); - -static int bootmem_debug; - -static int __init bootmem_debug_setup(char *buf) -{ - bootmem_debug = 1; - return 0; -} -early_param("bootmem_debug", bootmem_debug_setup); - -#define bdebug(fmt, args...) ({ \ - if (unlikely(bootmem_debug)) \ - pr_info("bootmem::%s " fmt, \ - __func__, ## args); \ -}) - -static unsigned long __init bootmap_bytes(unsigned long pages) -{ - unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE); - - return ALIGN(bytes, sizeof(long)); -} - -/** - * bootmem_bootmap_pages - calculate bitmap size in pages - * @pages: number of pages the bitmap has to represent - * - * Return: the number of pages needed to hold the bitmap. - */ -unsigned long __init bootmem_bootmap_pages(unsigned long pages) -{ - unsigned long bytes = bootmap_bytes(pages); - - return PAGE_ALIGN(bytes) >> PAGE_SHIFT; -} - -/* - * link bdata in order - */ -static void __init link_bootmem(bootmem_data_t *bdata) -{ - bootmem_data_t *ent; - - list_for_each_entry(ent, &bdata_list, list) { - if (bdata->node_min_pfn < ent->node_min_pfn) { - list_add_tail(&bdata->list, &ent->list); - return; - } - } - - list_add_tail(&bdata->list, &bdata_list); -} - -/* - * Called once to set up the allocator itself. - */ -static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, - unsigned long mapstart, unsigned long start, unsigned long end) -{ - unsigned long mapsize; - - mminit_validate_memmodel_limits(&start, &end); - bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); - bdata->node_min_pfn = start; - bdata->node_low_pfn = end; - link_bootmem(bdata); - - /* - * Initially all pages are reserved - setup_arch() has to - * register free RAM areas explicitly. 
- */ - mapsize = bootmap_bytes(end - start); - memset(bdata->node_bootmem_map, 0xff, mapsize); - - bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", - bdata - bootmem_node_data, start, mapstart, end, mapsize); - - return mapsize; -} - -/** - * init_bootmem_node - register a node as boot memory - * @pgdat: node to register - * @freepfn: pfn where the bitmap for this node is to be placed - * @startpfn: first pfn on the node - * @endpfn: first pfn after the node - * - * Return: the number of bytes needed to hold the bitmap for this node. - */ -unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, - unsigned long startpfn, unsigned long endpfn) -{ - return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); -} - -/** - * init_bootmem - register boot memory - * @start: pfn where the bitmap is to be placed - * @pages: number of available physical pages - * - * Return: the number of bytes needed to hold the bitmap. - */ -unsigned long __init init_bootmem(unsigned long start, unsigned long pages) -{ - max_low_pfn = pages; - min_low_pfn = start; - return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); -} - -void __init free_bootmem_late(unsigned long physaddr, unsigned long size) -{ - unsigned long cursor, end; - - kmemleak_free_part_phys(physaddr, size); - - cursor = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; - } -} - -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) -{ - struct page *page; - unsigned long *map, start, end, pages, cur, count = 0; - - if (!bdata->node_bootmem_map) - return 0; - - map = bdata->node_bootmem_map; - start = bdata->node_min_pfn; - end = bdata->node_low_pfn; - - bdebug("nid=%td start=%lx end=%lx\n", - bdata - bootmem_node_data, start, end); - - while (start < end) { - unsigned long idx, vec; - unsigned shift; - - idx = start - bdata->node_min_pfn; - shift = idx & (BITS_PER_LONG - 1); - /* - * vec holds at most BITS_PER_LONG map bits, - * bit 0 corresponds to start. - */ - vec = ~map[idx / BITS_PER_LONG]; - - if (shift) { - vec >>= shift; - if (end - start >= BITS_PER_LONG) - vec |= ~map[idx / BITS_PER_LONG + 1] << - (BITS_PER_LONG - shift); - } - /* - * If we have a properly aligned and fully unreserved - * BITS_PER_LONG block of pages in front of us, free - * it in one go. 
- */ - if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { - int order = ilog2(BITS_PER_LONG); - - __free_pages_bootmem(pfn_to_page(start), start, order); - count += BITS_PER_LONG; - start += BITS_PER_LONG; - } else { - cur = start; - - start = ALIGN(start + 1, BITS_PER_LONG); - while (vec && cur != start) { - if (vec & 1) { - page = pfn_to_page(cur); - __free_pages_bootmem(page, cur, 0); - count++; - } - vec >>= 1; - ++cur; - } - } - } - - cur = bdata->node_min_pfn; - page = virt_to_page(bdata->node_bootmem_map); - pages = bdata->node_low_pfn - bdata->node_min_pfn; - pages = bootmem_bootmap_pages(pages); - count += pages; - while (pages--) - __free_pages_bootmem(page++, cur++, 0); - bdata->node_bootmem_map = NULL; - - bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); - - return count; -} - -static int reset_managed_pages_done __initdata; - -void reset_node_managed_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; -} - -void __init reset_all_zones_managed_pages(void) -{ - struct pglist_data *pgdat; - - if (reset_managed_pages_done) - return; - - for_each_online_pgdat(pgdat) - reset_node_managed_pages(pgdat); - - reset_managed_pages_done = 1; -} - -unsigned long __init free_all_bootmem(void) -{ - unsigned long total_pages = 0; - bootmem_data_t *bdata; - - reset_all_zones_managed_pages(); - - list_for_each_entry(bdata, &bdata_list, list) - total_pages += free_all_bootmem_core(bdata); - - totalram_pages += total_pages; - - return total_pages; -} - -static void __init __free(bootmem_data_t *bdata, - unsigned long sidx, unsigned long eidx) -{ - unsigned long idx; - - bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, - sidx + bdata->node_min_pfn, - eidx + bdata->node_min_pfn); - - if (WARN_ON(bdata->node_bootmem_map == NULL)) - return; - - if (bdata->hint_idx > sidx) - bdata->hint_idx = sidx; - - for (idx = sidx; idx < eidx; idx++) - if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) - BUG(); -} - -static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, - unsigned long eidx, int flags) -{ - unsigned long idx; - int exclusive = flags & BOOTMEM_EXCLUSIVE; - - bdebug("nid=%td start=%lx end=%lx flags=%x\n", - bdata - bootmem_node_data, - sidx + bdata->node_min_pfn, - eidx + bdata->node_min_pfn, - flags); - - if (WARN_ON(bdata->node_bootmem_map == NULL)) - return 0; - - for (idx = sidx; idx < eidx; idx++) - if (test_and_set_bit(idx, bdata->node_bootmem_map)) { - if (exclusive) { - __free(bdata, sidx, idx); - return -EBUSY; - } - bdebug("silent double reserve of PFN %lx\n", - idx + bdata->node_min_pfn); - } - return 0; -} - -static int __init mark_bootmem_node(bootmem_data_t *bdata, - unsigned long start, unsigned long end, - int reserve, int flags) -{ - unsigned long sidx, eidx; - - bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", - bdata - bootmem_node_data, start, end, reserve, flags); - - BUG_ON(start < bdata->node_min_pfn); - BUG_ON(end > bdata->node_low_pfn); - - sidx = start - bdata->node_min_pfn; - eidx = end - bdata->node_min_pfn; - - if (reserve) - return __reserve(bdata, sidx, eidx, flags); - else - __free(bdata, sidx, eidx); - return 0; -} - -static int __init mark_bootmem(unsigned long start, unsigned long end, - int reserve, int flags) -{ - unsigned long pos; - bootmem_data_t *bdata; - - pos = start; - list_for_each_entry(bdata, &bdata_list, list) { - int err; - unsigned long max; - - if (pos < bdata->node_min_pfn || - pos >= 
bdata->node_low_pfn) { - BUG_ON(pos != start); - continue; - } - - max = min(bdata->node_low_pfn, end); - - err = mark_bootmem_node(bdata, pos, max, reserve, flags); - if (reserve && err) { - mark_bootmem(start, pos, 0, 0); - return err; - } - - if (max == end) - return 0; - pos = bdata->node_low_pfn; - } - BUG(); -} - -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - unsigned long start, end; - - kmemleak_free_part_phys(physaddr, size); - - start = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -} - -void __init free_bootmem(unsigned long physaddr, unsigned long size) -{ - unsigned long start, end; - - kmemleak_free_part_phys(physaddr, size); - - start = PFN_UP(physaddr); - end = PFN_DOWN(physaddr + size); - - mark_bootmem(start, end, 0, 0); -} - -/** - * reserve_bootmem_node - mark a page range as reserved - * @pgdat: node the range resides on - * @physaddr: starting address of the range - * @size: size of the range in bytes - * @flags: reservation flags (see linux/bootmem.h) - * - * Partial pages will be reserved. - * - * The range must reside completely on the specified node. - * - * Return: 0 on success, -errno on failure. - */ -int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size, int flags) -{ - unsigned long start, end; - - start = PFN_DOWN(physaddr); - end = PFN_UP(physaddr + size); - - return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -} - -/** - * reserve_bootmem - mark a page range as reserved - * @addr: starting address of the range - * @size: size of the range in bytes - * @flags: reservation flags (see linux/bootmem.h) - * - * Partial pages will be reserved. - * - * The range must be contiguous but may span node boundaries. - * - * Return: 0 on success, -errno on failure. - */ -int __init reserve_bootmem(unsigned long addr, unsigned long size, - int flags) -{ - unsigned long start, end; - - start = PFN_DOWN(addr); - end = PFN_UP(addr + size); - - return mark_bootmem(start, end, 1, flags); -} - -static unsigned long __init align_idx(struct bootmem_data *bdata, - unsigned long idx, unsigned long step) -{ - unsigned long base = bdata->node_min_pfn; - - /* - * Align the index with respect to the node start so that the - * combination of both satisfies the requested alignment. 
- */ - - return ALIGN(base + idx, step) - base; -} - -static unsigned long __init align_off(struct bootmem_data *bdata, - unsigned long off, unsigned long align) -{ - unsigned long base = PFN_PHYS(bdata->node_min_pfn); - - /* Same as align_idx for byte offsets */ - - return ALIGN(base + off, align) - base; -} - -static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - unsigned long fallback = 0; - unsigned long min, max, start, sidx, midx, step; - - bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", - bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, - align, goal, limit); - - BUG_ON(!size); - BUG_ON(align & (align - 1)); - BUG_ON(limit && goal + size > limit); - - if (!bdata->node_bootmem_map) - return NULL; - - min = bdata->node_min_pfn; - max = bdata->node_low_pfn; - - goal >>= PAGE_SHIFT; - limit >>= PAGE_SHIFT; - - if (limit && max > limit) - max = limit; - if (max <= min) - return NULL; - - step = max(align >> PAGE_SHIFT, 1UL); - - if (goal && min < goal && goal < max) - start = ALIGN(goal, step); - else - start = ALIGN(min, step); - - sidx = start - bdata->node_min_pfn; - midx = max - bdata->node_min_pfn; - - if (bdata->hint_idx > sidx) { - /* - * Handle the valid case of sidx being zero and still - * catch the fallback below. - */ - fallback = sidx + 1; - sidx = align_idx(bdata, bdata->hint_idx, step); - } - - while (1) { - int merge; - void *region; - unsigned long eidx, i, start_off, end_off; -find_block: - sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); - sidx = align_idx(bdata, sidx, step); - eidx = sidx + PFN_UP(size); - - if (sidx >= midx || eidx > midx) - break; - - for (i = sidx; i < eidx; i++) - if (test_bit(i, bdata->node_bootmem_map)) { - sidx = align_idx(bdata, i, step); - if (sidx == i) - sidx += step; - goto find_block; - } - - if (bdata->last_end_off & (PAGE_SIZE - 1) && - PFN_DOWN(bdata->last_end_off) + 1 == sidx) - start_off = align_off(bdata, bdata->last_end_off, align); - else - start_off = PFN_PHYS(sidx); - - merge = PFN_DOWN(start_off) < sidx; - end_off = start_off + size; - - bdata->last_end_off = end_off; - bdata->hint_idx = PFN_UP(end_off); - - /* - * Reserve the area now: - */ - if (__reserve(bdata, PFN_DOWN(start_off) + merge, - PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) - BUG(); - - region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + - start_off); - memset(region, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. 
- */ - kmemleak_alloc(region, size, 0, 0); - return region; - } - - if (fallback) { - sidx = align_idx(bdata, fallback - 1, step); - fallback = 0; - goto find_block; - } - - return NULL; -} - -static void * __init alloc_bootmem_core(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - bootmem_data_t *bdata; - void *region; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - - list_for_each_entry(bdata, &bdata_list, list) { - if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) - continue; - if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) - break; - - region = alloc_bootmem_bdata(bdata, size, align, goal, limit); - if (region) - return region; - } - - return NULL; -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - -restart: - ptr = alloc_bootmem_core(size, align, goal, limit); - if (ptr) - return ptr; - if (goal) { - goal = 0; - goto restart; - } - - return NULL; -} - -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = 0; - - return ___alloc_bootmem_nopanic(size, align, goal, limit); -} - -static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. - */ - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = 0; - - return ___alloc_bootmem(size, align, goal, limit); -} - -void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -again: - - /* do not panic in alloc_bootmem_bdata() */ - if (limit && goal + size > limit) - limit = 0; - - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); - if (ptr) - return ptr; - - ptr = alloc_bootmem_core(size, align, goal, limit); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - return NULL; -} - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); -} - -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal, - unsigned long limit) -{ - void *ptr; - - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); - if (ptr) - return ptr; - - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, 0); -} - -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ -#ifdef MAX_DMA32_PFN - unsigned long end_pfn; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, 
pgdat->node_id); - - /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat_end_pfn(pgdat); - - if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && - (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { - void *ptr; - unsigned long new_goal; - - new_goal = MAX_DMA32_PFN << PAGE_SHIFT; - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, - new_goal, 0); - if (ptr) - return ptr; - } -#endif - - return __alloc_bootmem_node(pgdat, size, align, goal); - -} - -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_nopanic(unsigned long size, - unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem_nopanic(size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -} diff --git a/mm/filemap.c b/mm/filemap.c index 3968da1f7f5a..218d0b2ec82d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -113,60 +113,26 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->i_pages, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (shadowp) - *shadowp = p; - } - __radix_tree_replace(&mapping->i_pages, node, slot, page, - workingset_lookup_update(mapping)); - mapping->nrpages++; - return 0; -} - -static void page_cache_tree_delete(struct address_space *mapping, +static void page_cache_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + XA_STATE(xas, &mapping->i_pages, page->index); + unsigned int nr = 1; + + mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by one entry in the radix tree */ - nr = PageHuge(page) ? 
1 : hpage_nr_pages(page); + /* hugetlb pages are represented by a single entry in the xarray */ + if (!PageHuge(page)) { + xas_set_order(&xas, page->index, compound_order(page)); + nr = 1U << compound_order(page); + } VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->i_pages, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->i_pages, node, slot); - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, - workingset_lookup_update(mapping)); - } + xas_store(&xas, shadow); + xas_init_marks(&xas); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@ -265,7 +231,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) trace_mm_filemap_delete_from_page_cache(page); unaccount_page_cache_page(mapping, page); - page_cache_tree_delete(mapping, page, shadow); + page_cache_delete(mapping, page, shadow); } static void page_cache_free_page(struct address_space *mapping, @@ -308,7 +274,7 @@ void delete_from_page_cache(struct page *page) EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_tree_delete_batch - delete several pages from page cache + * page_cache_delete_batch - delete several pages from page cache * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * @@ -321,24 +287,19 @@ EXPORT_SYMBOL(delete_from_page_cache); * * The function expects the i_pages lock to be held. */ -static void -page_cache_tree_delete_batch(struct address_space *mapping, +static void page_cache_delete_batch(struct address_space *mapping, struct pagevec *pvec) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; int i = 0, tail_pages = 0; struct page *page; - pgoff_t start; - start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + mapping_set_update(&xas, mapping); + xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec) && !tail_pages) break; - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!tail_pages) { /* @@ -346,8 +307,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, * have our pages locked so they are protected from * being removed. 
*/ - if (page != pvec->pages[i]) + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); continue; + } WARN_ON_ONCE(!PageLocked(page)); if (PageTransHuge(page) && !PageHuge(page)) tail_pages = HPAGE_PMD_NR - 1; @@ -358,11 +322,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, */ i++; } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); tail_pages--; } - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, - workingset_lookup_update(mapping)); + xas_store(&xas, NULL); total_pages++; } mapping->nrpages -= total_pages; @@ -383,7 +347,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, unaccount_page_cache_page(mapping, pvec->pages[i]); } - page_cache_tree_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, pvec); xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) @@ -493,20 +457,31 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_SHIFT; - pgoff_t end = end_byte >> PAGE_SHIFT; struct page *page; + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; - if (mapping->nrpages == 0) - return false; + rcu_read_lock(); + for (;;) { + page = xas_find(&xas, max); + if (xas_retry(&xas, page)) + continue; + /* Shadow entries don't count */ + if (xa_is_value(page)) + continue; + /* + * We don't need to try to pin this page; we're about to + * release the RCU lock anyway. It is enough to know that + * there was a page here recently. + */ + break; + } + rcu_read_unlock(); - if (!find_get_pages_range(mapping, &index, end, 1, &page)) - return false; - put_page(page); - return true; + return page != NULL; } EXPORT_SYMBOL(filemap_range_has_page); @@ -777,51 +752,44 @@ EXPORT_SYMBOL(file_write_and_wait_range); * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + XA_STATE(xas, &mapping->i_pages, offset); + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; - - get_page(new); - new->mapping = mapping; - new->index = offset; + get_page(new); + new->mapping = mapping; + new->index = offset; - xa_lock_irqsave(&mapping->i_pages, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); + xas_lock_irqsave(&xas, flags); + xas_store(&xas, new); - /* - * hugetlb pages do not participate in page cache accounting. 
- */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - xa_unlock_irqrestore(&mapping->i_pages, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(old)) + __dec_node_page_state(new, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(new, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + xas_unlock_irqrestore(&xas, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -830,12 +798,15 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); + mapping_set_update(&xas, mapping); if (!huge) { error = mem_cgroup_try_charge(page, current->mm, @@ -844,39 +815,47 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; - xa_lock_irq(&mapping->i_pages); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + do { + xas_lock_irq(&xas); + old = xas_load(&xas); + if (old && !xa_is_value(old)) + xas_set_err(&xas, -EEXIST); + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; + + if (xa_is_value(old)) { + mapping->nrexceptional--; + if (shadowp) + *shadowp = old; + } + mapping->nrpages++; + + /* hugetlb pages do not participate in page cache accounting */ + if (!huge) + __inc_node_page_state(page, NR_FILE_PAGES); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + + if (xas_error(&xas)) + goto error; - /* hugetlb pages do not participate in page cache accounting. */ - if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; -err_insert: +error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return error; + return xas_error(&xas); } /** @@ -1341,86 +1320,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * page_cache_next_hole - find the next hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. - * - * page_cache_next_hole may be called under rcu_read_lock. 
However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. */ -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - struct page *page; + XA_STATE(xas, &mapping->i_pages, index); - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) break; - index++; - if (index == 0) + if (xas.xa_index == 0) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_next_hole); +EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_hole - find the prev hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. - * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * page_cache_prev_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. 
*/ -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - struct page *page; + XA_STATE(xas, &mapping->i_pages, index); - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) break; - index--; - if (index == ULONG_MAX) + if (xas.xa_index == ULONG_MAX) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_prev_hole); +EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry @@ -1437,47 +1406,40 @@ EXPORT_SYMBOL(page_cache_prev_hole); */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; + XA_STATE(xas, &mapping->i_pages, offset); struct page *head, *page; rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + xas_reset(&xas); + page = xas_load(&xas); + if (xas_retry(&xas, page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, or a swap entry from + * shmem/tmpfs. Return it without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(head); - goto repeat; - } + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != xas_reload(&xas))) { + put_page(head); + goto repeat; } out: rcu_read_unlock(); @@ -1508,7 +1470,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? 
*/ if (unlikely(page_mapping(page) != mapping)) { @@ -1554,7 +1516,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) page = NULL; if (!page) goto no_page; @@ -1640,53 +1602,48 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1717,64 +1674,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *start); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, end) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? 
*/ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *start = pages[ret - 1]->index + 1; + *start = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is page at index -1 but that is + * breaks the iteration when there is a page at index -1 but that is * already broken anyway. */ if (end == (pgoff_t)-1) @@ -1802,57 +1745,43 @@ out: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); + struct page *page; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; + if (xas_retry(&xas, page)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. + */ + if (xa_is_value(page)) break; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; /* * must check mapping and index after taking the ref. * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { + if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { put_page(page); break; } @@ -1860,6 +1789,11 @@ repeat: pages[ret] = page; if (++ret == nr_pages) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -1879,74 +1813,58 @@ EXPORT_SYMBOL(find_get_pages_contig); * @tag. We update @index to index the next page for the traversal. 
*/ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *index); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, end, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. - */ + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *index = pages[ret - 1]->index + 1; + *index = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* - * We come here when we got at @end. We take care to not overflow the + * We come here when we got to @end. We take care to not overflow the * index @index as it confuses some of the callers. This breaks the - * iteration when there is page at index -1 but that is already broken - * anyway. + * iteration when there is a page at index -1 but that is already + * broken anyway. */ if (end == (pgoff_t)-1) *index = (pgoff_t)-1; @@ -1972,57 +1890,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * @tag. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, ULONG_MAX, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. 
Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -2626,45 +2538,31 @@ EXPORT_SYMBOL(filemap_fault); void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; + XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { - if (iter.index > end_pgoff) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } + xas_for_each(&xas, page, end_pgoff) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) goto next; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto next; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto skip; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto skip; if (!PageUptodate(page) || PageReadahead(page) || @@ -2683,10 +2581,10 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@ -2699,8 +2597,6 @@ next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } @@ -2810,7 +2706,7 @@ repeat: put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } @@ -1817,8 +1817,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * interrupts disabled by get_futex_key. * * With interrupts disabled, we block page table pages from being - * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h - * for more details. + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock(.) here as we also want to * block IPIs that come from THPs splitting. 
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index debf11388a60..5b42d3d4b60a 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -27,6 +27,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, int nr; struct page **pages; + if (gup->size > ULONG_MAX) + return -EINVAL; + nr_pages = gup->size / PAGE_SIZE; pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); if (!pages) @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * Authors: Jérôme Glisse <jglisse@redhat.com> + * Authors: Jérôme Glisse <jglisse@redhat.com> */ /* * Refer to include/linux/hmm.h for information about heterogeneous memory @@ -43,7 +43,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * * @mm: mm struct this HMM struct is bound to * @lock: lock protecting ranges list - * @sequence: we track updates to the CPU page table with a sequence number * @ranges: list of range being snapshotted * @mirrors: list of mirrors for this mm * @mmu_notifier: mmu notifier to track updates to CPU page table @@ -52,7 +51,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; struct hmm { struct mm_struct *mm; spinlock_t lock; - atomic_t sequence; struct list_head ranges; struct list_head mirrors; struct mmu_notifier mmu_notifier; @@ -85,22 +83,11 @@ static struct hmm *hmm_register(struct mm_struct *mm) return NULL; INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); - atomic_set(&hmm->sequence, 0); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->lock); hmm->mm = mm; - /* - * We should only get here if hold the mmap_sem in write mode ie on - * registration of first mirror through hmm_mirror_register() - */ - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { - kfree(hmm); - return NULL; - } - spin_lock(&mm->page_table_lock); if (!mm->hmm) mm->hmm = hmm; @@ -108,12 +95,27 @@ static struct hmm *hmm_register(struct mm_struct *mm) cleanup = true; spin_unlock(&mm->page_table_lock); - if (cleanup) { - mmu_notifier_unregister(&hmm->mmu_notifier, mm); - kfree(hmm); - } + if (cleanup) + goto error; + + /* + * We should only get here if hold the mmap_sem in write mode ie on + * registration of first mirror through hmm_mirror_register() + */ + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) + goto error_mm; return mm->hmm; + +error_mm: + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); +error: + kfree(hmm); + return NULL; } void hmm_mm_destroy(struct mm_struct *mm) @@ -121,10 +123,8 @@ void hmm_mm_destroy(struct mm_struct *mm) kfree(mm->hmm); } -static void hmm_invalidate_range(struct hmm *hmm, - enum hmm_update_type action, - unsigned long start, - unsigned long end) +static int hmm_invalidate_range(struct hmm *hmm, bool device, + const struct hmm_update *update) { struct hmm_mirror *mirror; struct hmm_range *range; @@ -133,22 +133,33 @@ static void hmm_invalidate_range(struct hmm *hmm, list_for_each_entry(range, &hmm->ranges, list) { unsigned long addr, idx, npages; - if (end < range->start || start >= range->end) + if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(start, range->start); + addr = max(update->start, range->start); idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + npages = (min(range->end, update->end) - 
addr) >> PAGE_SHIFT; memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(&hmm->lock); + if (!device) + return 0; + down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) - mirror->ops->sync_cpu_device_pagetables(mirror, action, - start, end); + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); + if (!update->blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + return -EAGAIN; + } + } up_read(&hmm->mirrors_sem); + + return 0; } static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -178,18 +189,21 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - bool blockable) + struct mm_struct *mm, + unsigned long start, + unsigned long end, + bool blockable) { + struct hmm_update update; struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); - atomic_inc(&hmm->sequence); - - return 0; + update.start = start; + update.end = end; + update.event = HMM_UPDATE_INVALIDATE; + update.blockable = blockable; + return hmm_invalidate_range(hmm, true, &update); } static void hmm_invalidate_range_end(struct mmu_notifier *mn, @@ -197,11 +211,16 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { + struct hmm_update update; struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); - hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); + update.start = start; + update.end = end; + update.event = HMM_UPDATE_INVALIDATE; + update.blockable = true; + hmm_invalidate_range(hmm, false, &update); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { @@ -278,12 +297,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) if (!should_unregister || mm == NULL) return; + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + spin_lock(&mm->page_table_lock); if (mm->hmm == hmm) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -571,22 +591,42 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; pte_t *ptep; + pmd_t pmd; - i = (addr - range->start) >> PAGE_SHIFT; again: - if (pmd_none(*pmdp)) + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) + if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); - if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - pmd_t pmd; + if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { + bool fault, write_fault; + unsigned long npages; + uint64_t *pfns; + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + if (fault || write_fault) { + hmm_vma_walk->last = addr; + pmd_migration_entry_wait(vma->vm_mm, pmdp); + return -EAGAIN; + } + return 0; + } else if (!pmd_present(pmd)) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* * No need to take pmd_lock here, even if some other threads * is splitting 
the huge pmd we will get that event through @@ -601,13 +641,21 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; + i = (addr - range->start) >> PAGE_SHIFT; return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } - if (pmd_bad(*pmdp)) + /* + * We have handled all the valid case above ie either none, migration, + * huge or transparent huge. At this point either it is a valid pmd + * entry pointing to pte directory or it is a bad pmd that will not + * recover. + */ + if (pmd_bad(pmd)) return hmm_pfns_bad(start, end, walk); ptep = pte_offset_map(pmdp, addr); + i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { int r; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 25ef59b7ee34..4e4ef8fa479d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2450,13 +2450,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - /* Additional pin to radix tree of swap cache */ + /* Additional pin to swap cache */ if (PageSwapCache(head)) page_ref_add(head, 2); else page_ref_inc(head); } else { - /* Additional pin to radix tree */ + /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } @@ -2568,7 +2568,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) { int extra_pins; - /* Additional pins from radix tree */ + /* Additional pins from page cache */ if (PageAnon(page)) extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; else @@ -2664,17 +2664,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); if (mapping) { - void **pslot; + XA_STATE(xas, &mapping->i_pages, page_index(head)); - xa_lock(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(head)); /* - * Check if the head page is present in radix tree. + * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. 
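For readers following the XArray conversion in the split_huge_page_to_list() hunk here: the radix_tree_lookup_slot()/radix_tree_deref_slot_protected() pair gives way to an XA_STATE cursor queried with xas_load() under the i_pages lock. A minimal sketch of that lookup idiom, illustrative only and not part of this patch ("mapping" and "index" are placeholder names):

	XA_STATE(xas, &mapping->i_pages, index);
	struct page *page;

	xas_lock_irq(&xas);
	page = xas_load(&xas);		/* entry at index, or NULL if none */
	if (xa_is_value(page))
		page = NULL;		/* value entry (shadow/swap), not a struct page */
	xas_unlock_irq(&xas);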
*/ - if (radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != head) + xa_lock(&mapping->i_pages); + if (xas_load(&xas) != head) goto fail; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b5c0ad9a6bd..c007fb5fb8d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -15,7 +15,7 @@ #include <linux/compiler.h> #include <linux/cpuset.h> #include <linux/mutex.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/mmdebug.h> @@ -2100,9 +2100,9 @@ int __alloc_bootmem_huge_page(struct hstate *h) for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = memblock_virt_alloc_try_nid_raw( + addr = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), - 0, BOOTMEM_ALLOC_ACCESSIBLE, node); + 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the diff --git a/mm/internal.h b/mm/internal.h index 87256ae1bef8..291eb2b6d1d8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -161,7 +161,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned long pfn, +extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 7a2a2f13f86f..c7550eb65922 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -10,11 +10,10 @@ * */ -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> -#include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> #include <linux/slab.h> @@ -83,8 +82,8 @@ static inline bool kasan_zero_page_entry(pte_t pte) static __init void *early_alloc(size_t size, int node) { - return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, node); + return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); } static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a31d740e6cd1..c13625c1ad5e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1288,17 +1288,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * * Basic scheme is simple, details are more complex: * - allocate and freeze a new huge page; - * - scan over radix tree replacing old pages the new one + * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; - * + keep old pages around in case if rollback is required; - * - if replacing succeed: + * + keep old pages around in case rollback is required; + * - if replacing succeeds: * + copy data over; * + free old pages; * + unfreeze huge page; * - if replacing failed; * + put all pages back and unfreeze them; - * + restore gaps in the radix-tree; + * + restore gaps in the page cache; * + free huge page; */ static void collapse_shmem(struct mm_struct *mm, @@ -1306,12 +1306,11 @@ static void collapse_shmem(struct mm_struct *mm, struct page **hpage, int node) { gfp_t gfp; - struct page *page, *new_page, *tmp; + struct page *new_page; struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - 
struct radix_tree_iter iter; - void **slot; + XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1336,48 +1335,49 @@ static void collapse_shmem(struct mm_struct *mm, __SetPageLocked(new_page); BUG_ON(!page_ref_freeze(new_page, 1)); - /* - * At this point the new_page is 'frozen' (page_count() is zero), locked - * and not up-to-date. It's safe to insert it into radix tree, because - * nobody would be able to map it or use it in other way until we - * unfreeze it. + * At this point the new_page is 'frozen' (page_count() is zero), + * locked and not up-to-date. It's safe to insert it into the page + * cache, because nobody would be able to map it or use it in other + * way until we unfreeze it. */ - index = start; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - int n = min(iter.index, end) - index; - - /* - * Handle holes in the radix tree: charge it from shmem and - * insert relevant subpage of new_page into the radix-tree. - */ - if (n && !shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; + /* This will be less messy when we use multi-index entries */ + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (!xas_error(&xas)) break; - } - nr_none += n; - for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } + xas_unlock_irq(&xas); + if (!xas_nomem(&xas, GFP_KERNEL)) + goto out; + } while (1); - /* We are done. */ - if (index >= end) - break; + xas_set(&xas, start); + for (index = start; index < end; index++) { + struct page *page = xas_next(&xas); + + VM_BUG_ON(index != xas.xa_index); + if (!page) { + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; + break; + } + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + nr_none++; + continue; + } - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - xa_unlock_irq(&mapping->i_pages); + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; - goto tree_unlocked; + goto xa_unlocked; } - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); } else if (trylock_page(page)) { get_page(page); } else { @@ -1397,7 +1397,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1407,17 +1407,16 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); - slot = radix_tree_lookup_slot(&mapping->i_pages, index); - VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock), page); + VM_BUG_ON_PAGE(page != xas_load(&xas), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: * - we hold a pin on it; - * - one reference from radix tree; + * - one reference from page cache; * - one from isolate_lru_page; */ if (!page_ref_freeze(page, 3)) { @@ -1432,56 +1431,30 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the 
new page. */ - radix_tree_replace_slot(&mapping->i_pages, slot, - new_page + (index % HPAGE_PMD_NR)); - - slot = radix_tree_iter_resume(slot, &iter); - index++; + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; out_lru: - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); putback_lru_page(page); out_isolate_failed: unlock_page(page); put_page(page); - goto tree_unlocked; + goto xa_unlocked; out_unlock: unlock_page(page); put_page(page); break; } + xas_unlock_irq(&xas); - /* - * Handle hole in radix tree at the end of the range. - * This code only triggers if there's nothing in radix tree - * beyond 'end'. - */ - if (result == SCAN_SUCCEED && index < end) { - int n = end - index; - - if (!shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; - goto tree_locked; - } - - for (; index < end; index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } - nr_none += n; - } - -tree_locked: - xa_unlock_irq(&mapping->i_pages); -tree_unlocked: - +xa_unlocked: if (result == SCAN_SUCCEED) { - unsigned long flags; + struct page *page, *tmp; struct zone *zone = page_zone(new_page); /* - * Replacing old pages with new one has succeed, now we need to - * copy the content and free old pages. + * Replacing old pages with new one has succeeded, now we + * need to copy the content and free the old pages. */ list_for_each_entry_safe(page, tmp, &pagelist, lru) { copy_highpage(new_page + (page->index % HPAGE_PMD_NR), @@ -1495,16 +1468,16 @@ tree_unlocked: put_page(page); } - local_irq_save(flags); + local_irq_disable(); __inc_node_page_state(new_page, NR_SHMEM_THPS); if (nr_none) { __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } - local_irq_restore(flags); + local_irq_enable(); /* - * Remove pte page tables, so we can re-faulti + * Remove pte page tables, so we can re-fault * the page as huge. */ retract_page_tables(mapping, start); @@ -1521,37 +1494,37 @@ tree_unlocked: khugepaged_pages_collapsed++; } else { - /* Something went wrong: rollback changes to the radix-tree */ + struct page *page; + /* Something went wrong: roll back page cache changes */ shmem_uncharge(mapping->host, nr_none); - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; + xas_lock_irq(&xas); + xas_set(&xas, start); + xas_for_each(&xas, page, end - 1) { page = list_first_entry_or_null(&pagelist, struct page, lru); - if (!page || iter.index < page->index) { + if (!page || xas.xa_index < page->index) { if (!nr_none) break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->i_pages, iter.index); + xas_store(&xas, NULL); continue; } - VM_BUG_ON_PAGE(page->index != iter.index, page); + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); /* Unfreeze the page. 
*/ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->i_pages, slot, page); - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + xas_store(&xas, page); + xas_pause(&xas); + xas_unlock_irq(&xas); putback_lru_page(page); unlock_page(page); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } VM_BUG_ON(nr_none); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1569,8 +1542,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, pgoff_t start, struct page **hpage) { struct page *page = NULL; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; @@ -1579,17 +1551,11 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= start + HPAGE_PMD_NR) - break; - - page = radix_tree_deref_slot(slot); - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, page)) continue; - } - if (radix_tree_exception(page)) { + if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; break; @@ -1628,7 +1594,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4f7e4b5a2f08..877de4fa0720 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,7 +92,7 @@ #include <linux/stacktrace.h> #include <linux/cache.h> #include <linux/percpu.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mmzone.h> #include <linux/slab.h> diff --git a/mm/madvise.c b/mm/madvise.c index 71d21df2a3f3..6cb1ca93e290 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = find_get_entry(mapping, index); - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { if (page) put_page(page); continue; diff --git a/mm/memblock.c b/mm/memblock.c index a85315083b5a..7df468c8ebc8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,7 +20,6 @@ #include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <asm/sections.h> #include <linux/io.h> @@ -82,6 +81,16 @@ * initialization compltes. 
*/ +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; +unsigned long long max_possible_pfn; + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -1238,8 +1247,11 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, { phys_addr_t found; - if (!align) + if (!align) { + /* Can't use WARNs this early in boot on powerpc */ + dump_stack(); align = SMP_CACHE_BYTES; + } found = memblock_find_in_range_node(size, align, start, end, nid, flags); @@ -1269,7 +1281,7 @@ phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } -phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) +phys_addr_t __init memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t ret; @@ -1304,23 +1316,22 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys return alloc; } -phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +phys_addr_t __init memblock_phys_alloc(phys_addr_t size, phys_addr_t align) { return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) +phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { - phys_addr_t res = memblock_alloc_nid(size, align, nid); + phys_addr_t res = memblock_phys_alloc_nid(size, align, nid); if (res) return res; return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } -#if defined(CONFIG_NO_BOOTMEM) /** - * memblock_virt_alloc_internal - allocate boot memory block + * memblock_alloc_internal - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region to allocate (phys address) @@ -1333,9 +1344,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * hold the requested memory. * * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. - * - * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0. + * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. * * The phys address of allocated boot memory block is converted to virtual and * allocated memory is reset to 0. @@ -1346,7 +1355,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * Return: * Virtual address of allocated memory block on success, NULL on failure. 
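The kernel-doc above describes memblock_alloc_internal(), the worker behind the renamed memblock_alloc_try_nid*() family. As an illustration only (not part of this patch), a typical early-boot caller after the rename follows the same shape as the kasan and page_ext hunks in this series; the function name below is hypothetical:

	/* Hypothetical example of the renamed API; the nopanic variant returns NULL on failure. */
	static void * __init alloc_early_table(phys_addr_t size, int nid)
	{
		return memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES,
						      __pa(MAX_DMA_ADDRESS),
						      MEMBLOCK_ALLOC_ACCESSIBLE,
						      nid);
	}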
*/ -static void * __init memblock_virt_alloc_internal( +static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1361,13 +1370,15 @@ static void * __init memblock_virt_alloc_internal( /* * Detect any accidental use of these APIs after slab is ready, as at * this moment memblock may be deinitialized already and its - * internal data may be destroyed (after execution of free_all_bootmem) + * internal data may be destroyed (after execution of memblock_free_all) */ if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); - if (!align) + if (!align) { + dump_stack(); align = SMP_CACHE_BYTES; + } if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; @@ -1413,14 +1424,14 @@ done: } /** - * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @@ -1431,7 +1442,7 @@ done: * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid_raw( +void * __init memblock_alloc_try_nid_raw( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1442,7 +1453,7 @@ void * __init memblock_virt_alloc_try_nid_raw( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr && size > 0) page_init_poison(ptr, size); @@ -1451,13 +1462,13 @@ void * __init memblock_virt_alloc_try_nid_raw( } /** - * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * memblock_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * @@ -1467,7 +1478,7 @@ void * __init memblock_virt_alloc_try_nid_raw( * Return: * Virtual address of allocated memory block on success, NULL on failure. 
*/ -void * __init memblock_virt_alloc_try_nid_nopanic( +void * __init memblock_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1478,7 +1489,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) memset(ptr, 0, size); @@ -1486,24 +1497,24 @@ void * __init memblock_virt_alloc_try_nid_nopanic( } /** - * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * memblock_alloc_try_nid - allocate boot memory block with panicking * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation * is preferred (phys address) * @max_addr: the upper bound of the memory region from where the allocation - * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * * Return: * Virtual address of allocated memory block on success, NULL on failure. */ -void * __init memblock_virt_alloc_try_nid( +void * __init memblock_alloc_try_nid( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) @@ -1513,7 +1524,7 @@ void * __init memblock_virt_alloc_try_nid( memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_virt_alloc_internal(size, align, + ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); if (ptr) { memset(ptr, 0, size); @@ -1524,14 +1535,13 @@ void * __init memblock_virt_alloc_try_nid( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); return NULL; } -#endif /** * __memblock_free_early - free boot memory block * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * Free boot memory block previously allocated by memblock_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. 
*/ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) @@ -1565,7 +1575,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); + memblock_free_pages(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -1879,6 +1889,100 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int order; + + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); + + while (start + (1UL << order) > end) + order--; + + memblock_free_pages(pfn_to_page(start), start, order); + + start += (1UL << order); + } +} + +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn >= end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + +static unsigned long __init free_low_memory_core_early(void) +{ + unsigned long count = 0; + phys_addr_t start, end; + u64 i; + + memblock_clear_hotplug(0, -1); + + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) + count += __free_memory_core(start, end); + + return count; +} + +static int reset_managed_pages_done __initdata; + +void reset_node_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + if (reset_managed_pages_done) + return; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + + reset_managed_pages_done = 1; +} + +/** + * memblock_free_all - release free pages to the buddy allocator + * + * Return: the number of pages actually released. + */ +unsigned long __init memblock_free_all(void) +{ + unsigned long pages; + + reset_all_zones_managed_pages(); + + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; +} + #if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 10a9b554d69f..54920cbc46bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4728,7 +4728,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. 
*/ if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; diff --git a/mm/memfd.c b/mm/memfd.c index 2bb5e257080e..97264c79d2cd 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -21,44 +21,36 @@ #include <uapi/linux/memfd.h> /* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ -static void memfd_tag_pins(struct address_space *mapping) +static void memfd_tag_pins(struct xa_state *xas) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; struct page *page; + unsigned int tagged = 0; lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_lock_irq(xas); + xas_for_each(xas, page, ULONG_MAX) { + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) > 1) + xas_set_mark(xas, MEMFD_TAG_PINNED); + + if (++tagged % XA_CHECK_SCHED) + continue; + + xas_pause(xas); + xas_unlock_irq(xas); + cond_resched(); + xas_lock_irq(xas); } - rcu_read_unlock(); + xas_unlock_irq(xas); } /* @@ -72,17 +64,17 @@ static void memfd_tag_pins(struct address_space *mapping) */ static int memfd_wait_for_pins(struct address_space *mapping) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; + XA_STATE(xas, &mapping->i_pages, 0); struct page *page; int error, scan; - memfd_tag_pins(mapping); + memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + unsigned int tagged = 0; + + if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) @@ -90,45 +82,34 @@ static int memfd_wait_for_pins(struct address_space *mapping) else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, MEMFD_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - + xas_set(&xas, 0); + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + bool clear = true; + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found pages pinned. 
*/ - error = -EBUSY; + if (scan == LAST_SCAN) + error = -EBUSY; + else + clear = false; } + if (clear) + xas_clear_mark(&xas, MEMFD_TAG_PINNED); + if (++tagged % XA_CHECK_SCHED) + continue; - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } - rcu_read_unlock(); + xas_unlock_irq(&xas); } return error; diff --git a/mm/memory.c b/mm/memory.c index 072139579d89..4ad2d293ddc2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1537,10 +1537,15 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared - * mapping and we expect the PFNs to match. + * mapping and we expect the PFNs to match. If they + * don't match, we are likely racing with block + * allocation and mapping invalidation so just skip the + * update. */ - if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; + } entry = *pte; goto out_mkwrite; } else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7e6509a53d79..61972da38d93 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,7 +33,6 @@ #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> -#include <linux/bootmem.h> #include <linux/compaction.h> #include <asm/tlbflush.h> @@ -839,7 +838,6 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, return zone; } -/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -851,6 +849,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ struct memory_notify arg; struct memory_block *mem; + mem_hotplug_begin(); + /* * We can't use pfn_to_nid() because nid might be stored in struct page * which is not yet initialized. Instead, we find nid from memory block. @@ -915,6 +915,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + mem_hotplug_done(); return 0; failed_addition: @@ -922,6 +923,7 @@ failed_addition: (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); + mem_hotplug_done(); return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1069,7 +1071,12 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +/* + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations (triggered e.g. by sysfs). 
+ * + * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG + */ int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; @@ -1121,26 +1128,26 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* device_online() will take the lock when calling online_pages() */ + mem_hotplug_done(); + /* online pages if requested */ if (online) walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, online_memory_block); - goto out; - + return ret; error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); memblock_remove(start, size); - -out: mem_hotplug_done(); return ret; } -EXPORT_SYMBOL_GPL(add_memory_resource); -int __ref add_memory(int nid, u64 start, u64 size) +/* requires device_hotplug_lock, see add_memory_resource() */ +int __ref __add_memory(int nid, u64 start, u64 size) { struct resource *res; int ret; @@ -1154,6 +1161,17 @@ int __ref add_memory(int nid, u64 start, u64 size) release_memory_resource(res); return ret; } + +int add_memory(int nid, u64 start, u64 size) +{ + int rc; + + lock_device_hotplug(); + rc = __add_memory(nid, start, size); + unlock_device_hotplug(); + + return rc; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -1540,10 +1558,16 @@ static int __ref __offline_pages(unsigned long start_pfn, return -EINVAL; if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) return -EINVAL; + + mem_hotplug_begin(); + /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, + &valid_end)) { + mem_hotplug_done(); return -EINVAL; + } zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); @@ -1552,8 +1576,10 @@ static int __ref __offline_pages(unsigned long start_pfn, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); - if (ret) + if (ret) { + mem_hotplug_done(); return ret; + } arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1624,6 +1650,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + mem_hotplug_done(); return 0; failed_removal: @@ -1633,10 +1660,10 @@ failed_removal: memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + mem_hotplug_done(); return ret; } -/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages); @@ -1807,7 +1834,7 @@ EXPORT_SYMBOL(try_offline_node); * and online/offline operations before this call, as required by * try_offline_node(). 
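The memory_hotplug.c hunks here introduce a locking convention worth spelling out: add_memory() and remove_memory() now take lock_device_hotplug() themselves, while the new __add_memory()/__remove_memory() variants are for callers that already hold it. An illustrative sketch, not part of the patch (names and values are placeholders):

	/* Caller that already holds, or must hold, the device hotplug lock: */
	static int example_hotadd_locked(int nid, u64 start, u64 size)
	{
		int rc;

		lock_device_hotplug();
		rc = __add_memory(nid, start, size);
		unlock_device_hotplug();
		return rc;
	}

	/* Ordinary caller: the wrapper takes the lock internally. */
	static int example_hotadd(int nid, u64 start, u64 size)
	{
		return add_memory(nid, start, size);
	}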
*/ -void __ref remove_memory(int nid, u64 start, u64 size) +void __ref __remove_memory(int nid, u64 start, u64 size) { int ret; @@ -1836,5 +1863,12 @@ void __ref remove_memory(int nid, u64 start, u64 size) mem_hotplug_done(); } + +void remove_memory(int nid, u64 start, u64 size) +{ + lock_device_hotplug(); + __remove_memory(nid, start, size); + unlock_device_hotplug(); +} EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/migrate.c b/mm/migrate.c index b6700f2962f3..f7e4bfdc13b7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -326,7 +326,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count + * Once page cache replacement of page migration started, page_count * *must* be zero. And, we don't want to call wait_on_page_locked() * against a page without get_page(). * So, we use get_page_unless_zero(), here. Even failed, page fault @@ -441,10 +441,10 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; int expected_count = 1 + extra_count; - void **pslot; /* * Device public or private pages have an extra refcount as they are @@ -470,21 +470,16 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); + xas_lock_irq(&xas); expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -498,7 +493,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -526,16 +521,13 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + xas_next(&xas); + xas_store(&xas, newpage + i); } } @@ -546,7 +538,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -586,22 +578,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + xas_lock_irq(&xas); 
expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -610,11 +598,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..4985965aa20a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * shmem/tmpfs may return swap: account for swapcache * page too. */ - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); page = find_get_page(swap_address_space(swp), swp_offset(swp)); diff --git a/mm/nobootmem.c b/mm/nobootmem.c deleted file mode 100644 index 439af3b765a7..000000000000 --- a/mm/nobootmem.c +++ /dev/null @@ -1,445 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * bootmem - A boot-time physical memory allocator and configurator - * - * Copyright (C) 1999 Ingo Molnar - * 1999 Kanoj Sarcar, SGI - * 2008 Johannes Weiner - * - * Access to this subsystem has to be serialized externally (which is true - * for the boot process anyway). - */ -#include <linux/init.h> -#include <linux/pfn.h> -#include <linux/slab.h> -#include <linux/export.h> -#include <linux/kmemleak.h> -#include <linux/range.h> -#include <linux/memblock.h> -#include <linux/bootmem.h> - -#include <asm/bug.h> -#include <asm/io.h> - -#include "internal.h" - -#ifndef CONFIG_HAVE_MEMBLOCK -#error CONFIG_HAVE_MEMBLOCK not defined -#endif - -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data; -EXPORT_SYMBOL(contig_page_data); -#endif - -unsigned long max_low_pfn; -unsigned long min_low_pfn; -unsigned long max_pfn; -unsigned long long max_possible_pfn; - -static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - enum memblock_flags flags = choose_memblock_flags(); - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - -again: - addr = memblock_find_in_range_node(size, align, goal, limit, nid, - flags); - if (!addr && (flags & MEMBLOCK_MIRROR)) { - flags &= ~MEMBLOCK_MIRROR; - pr_warn("Could not allocate %pap bytes of mirrored memory\n", - &size); - goto again; - } - if (!addr) - return NULL; - - if (memblock_reserve(addr, size)) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} - -/** - * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting address of the range - * @size: size of the range in bytes - * - * This is only useful when the bootmem allocator has already been torn - * down, but we are still initializing the system. Pages are given directly - * to the page allocator, no bootmem metadata is updated because it is gone. 
- */ -void __init free_bootmem_late(unsigned long addr, unsigned long size) -{ - unsigned long cursor, end; - - kmemleak_free_part_phys(addr, size); - - cursor = PFN_UP(addr); - end = PFN_DOWN(addr + size); - - for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; - } -} - -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int order; - - while (start < end) { - order = min(MAX_ORDER - 1UL, __ffs(start)); - - while (start + (1UL << order) > end) - order--; - - __free_pages_bootmem(pfn_to_page(start), start, order); - - start += (1UL << order); - } -} - -static unsigned long __init __free_memory_core(phys_addr_t start, - phys_addr_t end) -{ - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - - if (start_pfn >= end_pfn) - return 0; - - __free_pages_memory(start_pfn, end_pfn); - - return end_pfn - start_pfn; -} - -static unsigned long __init free_low_memory_core_early(void) -{ - unsigned long count = 0; - phys_addr_t start, end; - u64 i; - - memblock_clear_hotplug(0, -1); - - for_each_reserved_mem_region(i, &start, &end) - reserve_bootmem_region(start, end); - - /* - * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesn't have RAM installed - * low ram will be on Node1 - */ - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) - count += __free_memory_core(start, end); - - return count; -} - -static int reset_managed_pages_done __initdata; - -void reset_node_managed_pages(pg_data_t *pgdat) -{ - struct zone *z; - - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - z->managed_pages = 0; -} - -void __init reset_all_zones_managed_pages(void) -{ - struct pglist_data *pgdat; - - if (reset_managed_pages_done) - return; - - for_each_online_pgdat(pgdat) - reset_node_managed_pages(pgdat); - - reset_managed_pages_done = 1; -} - -/** - * free_all_bootmem - release free pages to the buddy allocator - * - * Return: the number of pages actually released. - */ -unsigned long __init free_all_bootmem(void) -{ - unsigned long pages; - - reset_all_zones_managed_pages(); - - pages = free_low_memory_core_early(); - totalram_pages += pages; - - return pages; -} - -/** - * free_bootmem_node - mark a page range as usable - * @pgdat: node the range resides on - * @physaddr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must reside completely on the specified node. - */ -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - memblock_free(physaddr, size); -} - -/** - * free_bootmem - mark a page range as usable - * @addr: starting physical address of the range - * @size: size of the range in bytes - * - * Partial pages will be considered reserved and left as they are. - * - * The range must be contiguous but may span node boundaries. 
- */ -void __init free_bootmem(unsigned long addr, unsigned long size) -{ - memblock_free(addr, size); -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -} - -/** - * __alloc_bootmem_nopanic - allocate boot memory without panicking - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * Return: address of the allocated region or %NULL on failure. - */ -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = -1UL; - - return ___alloc_bootmem_nopanic(size, align, goal, limit); -} - -static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); - - if (mem) - return mem; - /* - * Whoops, we cannot satisfy the allocation request. - */ - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -/** - * __alloc_bootmem - allocate boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. 
- */ -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) -{ - unsigned long limit = -1UL; - - return ___alloc_bootmem(size, align, goal, limit); -} - -void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, - unsigned long size, - unsigned long align, - unsigned long goal, - unsigned long limit) -{ - void *ptr; - -again: - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, limit); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, - goal, limit); - if (ptr) - return ptr; - - if (goal) { - goal = 0; - goto again; - } - - return NULL; -} - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); -} - -static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal, - unsigned long limit) -{ - void *ptr; - - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); - if (ptr) - return ptr; - - pr_alert("bootmem alloc of %lu bytes failed!\n", size); - panic("Out of memory"); - return NULL; -} - -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, 0); -} - -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - return __alloc_bootmem_node(pgdat, size, align, goal); -} - - -/** - * __alloc_bootmem_low - allocate low boot memory - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may happen on any node in the system. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. 
- */ -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); -} - -void * __init __alloc_bootmem_low_nopanic(unsigned long size, - unsigned long align, - unsigned long goal) -{ - return ___alloc_bootmem_nopanic(size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} - -/** - * __alloc_bootmem_low_node - allocate low boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. - * - * Return: address of the allocated region. - */ -void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - return ___alloc_bootmem_node(pgdat, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); -} diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 439a304a6c92..3f690bae6b78 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2097,34 +2097,25 @@ void __init page_writeback_init(void) * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ -/* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock - * latency. - */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { -#define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, - PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->i_pages, &iter, - PAGECACHE_TAG_TOWRITE); - tagged++; - if ((tagged % WRITEBACK_TAG_BATCH) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) continue; - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2170,7 +2161,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -2442,7 +2433,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * the xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. 
This is a "bottom-up" @@ -2468,7 +2459,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, page_index(page), + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); @@ -2631,13 +2622,13 @@ EXPORT_SYMBOL(__cancel_dirty_page); * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. */ int clear_page_dirty_for_io(struct page *page) @@ -2718,7 +2709,7 @@ int test_clear_page_writeback(struct page *page) xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2758,11 +2749,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - xa_lock_irqsave(&mapping->i_pages, flags); + xas_lock_irqsave(&xas, flags); + xas_load(&xas); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2770,8 +2763,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2784,12 +2776,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_TOWRITE); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { ret = TestSetPageWriteback(page); } @@ -2803,16 +2793,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); -/* - * Return true if any of the pages in the mapping are marked with the - * passed tag. 
- */ -int mapping_tagged(struct address_space *mapping, int tag) -{ - return radix_tree_tagged(&mapping->i_pages, tag); -} -EXPORT_SYMBOL(mapping_tagged); - /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 863d46da6586..a919ba5cb3c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -20,7 +20,6 @@ #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/jiffies.h> -#include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> @@ -1339,7 +1338,7 @@ meminit_pfn_in_nid(unsigned long pfn, int node, #endif -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) @@ -5476,7 +5475,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) /* * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is + * up by memblock_free_all() once the early boot process is * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, @@ -6209,7 +6208,7 @@ static void __ref setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = - memblock_virt_alloc_node_nopanic(usemapsize, + memblock_alloc_node_nopanic(usemapsize, pgdat->node_id); } #else @@ -6439,7 +6438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + map = memblock_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6508,8 +6507,7 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) - +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Zero all valid struct pages in range [spfn, epfn), return number of struct * pages zeroed @@ -6569,7 +6567,7 @@ void __init zero_resv_unavail(void) if (pgcnt) pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -7712,9 +7710,11 @@ void *__init alloc_large_system_hash(const char *tablename, size = bucketsize << log2qty; if (flags & HASH_EARLY) { if (flags & HASH_ZERO) - table = memblock_virt_alloc_nopanic(size, 0); + table = memblock_alloc_nopanic(size, + SMP_CACHE_BYTES); else - table = memblock_virt_alloc_raw(size, 0); + table = memblock_alloc_raw(size, + SMP_CACHE_BYTES); } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); } else { diff --git a/mm/page_ext.c b/mm/page_ext.c index a9826da84ccb..ae44f7adbe07 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> @@ -161,9 +161,9 @@ static int __init alloc_node_page_ext(int nid) table_size = get_entry_size() * nr_pages; - base = memblock_virt_alloc_try_nid_nopanic( + base = 
memblock_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; diff --git a/mm/page_idle.c b/mm/page_idle.c index 6302bc62c27d..b9e4b42b33ab 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kobject.h> diff --git a/mm/page_owner.c b/mm/page_owner.c index d80adfe702d3..87bc0dfdb52b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -3,7 +3,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/uaccess.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> #include <linux/jump_label.h> diff --git a/mm/page_poison.c b/mm/page_poison.c index aa2b3d34e8ea..f7e2a676365a 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -21,7 +21,7 @@ bool page_poisoning_enabled(void) { /* * Assumes that debug_pagealloc_enabled is set before - * free_all_bootmem. + * memblock_free_all. * Page poisoning is debug page alloc for some arches. If * either of those options are enabled, enable poisoning. */ diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ae3c2a35d61b..11df03e71288 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -21,7 +21,29 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; } else { - if (!pte_present(*pvmw->pte)) + /* + * We get here when we are trying to unmap a private + * device page from the process address space. Such + * page is not CPU accessible and thus is mapped as + * a special swap entry, nonetheless it still does + * count as a valid regular mapping for the page (and + * is accounted as such in page maps count). + * + * So handle this special case as if it was a normal + * page mapping ie lock CPU page table and returns + * true. + * + * For more details on device private memory see HMM + * (include/linux/hmm.h or mm/hmm.c). 
+ */ + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry)) + return false; + } else if (!pte_present(*pvmw->pte)) return false; } } diff --git a/mm/percpu.c b/mm/percpu.c index 4b90682623e9..a6b74c6fe0be 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -65,7 +65,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/err.h> #include <linux/lcm.h> #include <linux/list.h> @@ -1101,9 +1101,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - 0); + chunk = memblock_alloc(sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT), + SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1114,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0]), 0); - chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0]), 0); + chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), + SMP_CACHE_BYTES); + chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), + SMP_CACHE_BYTES); + chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), + SMP_CACHE_BYTES); pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -1888,7 +1888,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); + ptr = memblock_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; @@ -2075,12 +2075,14 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_virt_alloc(ai->nr_groups * - sizeof(group_offsets[0]), 0); - group_sizes = memblock_virt_alloc(ai->nr_groups * - sizeof(group_sizes[0]), 0); - unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); - unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); + group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), + SMP_CACHE_BYTES); + group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), + SMP_CACHE_BYTES); + unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), + SMP_CACHE_BYTES); + unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), + SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2144,8 +2146,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. 
*/ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = memblock_virt_alloc( - pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); + pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), + SMP_CACHE_BYTES); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -2458,7 +2460,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = memblock_virt_alloc_nopanic(areas_size, 0); + areas = memblock_alloc_nopanic(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; @@ -2599,7 +2601,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_virt_alloc(pages_size, 0); + pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; @@ -2688,7 +2690,7 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return memblock_virt_alloc_from_nopanic( + return memblock_alloc_from_nopanic( size, align, __pa(MAX_DMA_ADDRESS)); } @@ -2737,7 +2739,7 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = memblock_virt_alloc_from_nopanic(unit_size, + fc = memblock_alloc_from_nopanic(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) diff --git a/mm/readahead.c b/mm/readahead.c index 4e630143a0ba..f3d6f9656a3c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -176,10 +176,8 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, if (page_offset > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, page_offset); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, page_offset); + if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of * contiguous pages before continuing with the next @@ -336,7 +334,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = page_cache_prev_hole(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -425,7 +423,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, offset + 1, max_pages); rcu_read_unlock(); if (!start || start - offset > max_pages) diff --git a/mm/shmem.c b/mm/shmem.c index 446942677cd4..56bf122e0bb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -322,24 +322,20 @@ void shmem_uncharge(struct inode *inode, long pages) } /* - * Replace item expected in radix tree by a new item, while holding tree lock. + * Replace item expected in xarray by a new item, while holding xa_lock. 
*/ -static int shmem_radix_tree_replace(struct address_space *mapping, +static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { - struct radix_tree_node *node; - void __rcu **pslot; + XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); - if (!item) - return -ENOENT; + item = xas_load(&xas); if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->i_pages, node, pslot, - replacement, NULL); + xas_store(&xas, replacement); return 0; } @@ -353,12 +349,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { - void *item; - - rcu_read_lock(); - item = radix_tree_lookup(&mapping->i_pages, index); - rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); } /* @@ -586,9 +577,11 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected) + pgoff_t index, void *expected, gfp_t gfp) { - int error, nr = hpage_nr_pages(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); + unsigned long i = 0; + unsigned long nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -600,47 +593,39 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - xa_lock_irq(&mapping->i_pages); - if (PageTransHuge(page)) { - void __rcu **results; - pgoff_t idx; - int i; - - error = 0; - if (radix_tree_gang_lookup_slot(&mapping->i_pages, - &results, &idx, index, 1) && - idx < index + HPAGE_PMD_NR) { - error = -EEXIST; + do { + void *entry; + xas_lock_irq(&xas); + entry = xas_find_conflict(&xas); + if (entry != expected) + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; +next: + xas_store(&xas, page + i); + if (++i < nr) { + xas_next(&xas); + goto next; } - - if (!error) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->i_pages, - index + i, page + i); - VM_BUG_ON(error); - } + if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); + __inc_node_page_state(page, NR_SHMEM_THPS); } - } else if (!expected) { - error = radix_tree_insert(&mapping->i_pages, index, page); - } else { - error = shmem_radix_tree_replace(mapping, index, expected, - page); - } - - if (!error) { mapping->nrpages += nr; - if (PageTransHuge(page)) - __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - xa_unlock_irq(&mapping->i_pages); - } else { +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { page->mapping = NULL; - xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); + return xas_error(&xas); } - return error; + + return 0; } /* @@ -654,7 +639,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); xa_lock_irq(&mapping->i_pages); - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; 
__dec_node_page_state(page, NR_FILE_PAGES); @@ -665,7 +650,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Remove swap entry from radix tree, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) @@ -673,7 +658,7 @@ static int shmem_free_swap(struct address_space *mapping, void *old; xa_lock_irq(&mapping->i_pages); - old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0); xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; @@ -691,29 +676,19 @@ static int shmem_free_swap(struct address_space *mapping, unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { - struct radix_tree_iter iter; - void __rcu **slot; + XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; - - page = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, end - 1) { + if (xas_retry(&xas, page)) continue; - } - - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) swapped++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } @@ -788,7 +763,7 @@ void shmem_unlock_mapping(struct address_space *mapping) } /* - * Remove range of pages and swap entries from radix tree, and free them. + * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, @@ -824,7 +799,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -921,7 +896,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -1110,34 +1085,27 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +static unsigned long find_swap_entry(struct xarray *xa, void *item) { - struct radix_tree_iter iter; - void __rcu **slot; - unsigned long found = -1; + XA_STATE(xas, xa, 0); unsigned int checked = 0; + void *entry; rcu_read_lock(); - radix_tree_for_each_slot(slot, root, &iter, 0) { - void *entry = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) continue; - } - if (entry == item) { - found = iter.index; + if (entry == item) break; - } checked++; - if ((checked % 4096) != 0) + if ((checked % XA_CHECK_SCHED) != 0) continue; - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } - rcu_read_unlock(); - return found; + + return entry ? 
xas.xa_index : -1; } /* @@ -1175,10 +1143,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, * We needed to drop mutex to make that restrictive page * allocation, but the inode might have been freed while we * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the radix_tree, our page lock + * complete without emptying the page cache, our page lock * on this swapcache page is not enough to prevent that - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from radix_tree whatever. + * trylock_page(), removing swap from page cache whatever. * * We must not proceed to shmem_add_to_page_cache() if the * inode has been freed, but of course we cannot rely on @@ -1200,7 +1168,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap); + radswap, gfp); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1244,7 +1212,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) &memcg, false); if (error) goto out; - /* No radix_tree_preload: swap entry keeps a place for page in tree */ + /* No memory allocation: swap entry occupies the slot for the page */ error = -EAGAIN; mutex_lock(&shmem_swaplist_mutex); @@ -1453,23 +1421,17 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - pgoff_t idx, hindex; - void __rcu **results; + struct address_space *mapping = info->vfs_inode.i_mapping; + pgoff_t hindex; struct page *page; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return NULL; hindex = round_down(index, HPAGE_PMD_NR); - rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { - rcu_read_unlock(); + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, + XA_PRESENT)) return NULL; - } - rcu_read_unlock(); shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, @@ -1578,8 +1540,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * a nice clean interface for us to replace oldpage by newpage there. 
*/ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); + error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); @@ -1643,7 +1604,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, repeat: swap.val = 0; page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swap = radix_to_swp_entry(page); page = NULL; } @@ -1718,7 +1679,7 @@ repeat: false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap)); + swp_to_radix_entry(swap), gfp); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1824,13 +1785,8 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, PageTransHuge(page)); if (error) goto unacct; - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, - compound_order(page)); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL); - radix_tree_preload_end(); - } + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK); if (error) { mem_cgroup_cancel_charge(page, memcg, PageTransHuge(page)); @@ -1931,7 +1887,7 @@ unlock: spin_unlock_irq(&info->lock); goto repeat; } - if (error == -EEXIST) /* from above or from radix_tree_insert */ + if (error == -EEXIST) goto repeat; return error; } @@ -2299,11 +2255,8 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); - if (!ret) { - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); - radix_tree_preload_end(); - } + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK); if (ret) goto out_release_uncharge; @@ -2548,7 +2501,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + * llseek SEEK_DATA or SEEK_HOLE through the page cache. 
*/ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pgoff_t index, pgoff_t end, int whence) @@ -2578,7 +2531,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { if (!PageUptodate(page)) page = NULL; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 8301293331a2..7fec05796796 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,7 +20,7 @@ */ #include <linux/mm.h> #include <linux/mmzone.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/memremap.h> #include <linux/highmem.h> #include <linux/slab.h> @@ -42,8 +42,8 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid_raw(size, align, goal, - BOOTMEM_ALLOC_ACCESSIBLE, node); + return memblock_alloc_try_nid_raw(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, node); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) diff --git a/mm/sparse.c b/mm/sparse.c index 67ad061f7fb8..33307fc05c4d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,7 +5,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/mmzone.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/export.h> @@ -68,7 +68,8 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) if (slab_is_available()) section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = memblock_virt_alloc_node(array_size, nid); + section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, + nid); return section; } @@ -216,7 +217,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_virt_alloc(size, align); + mem_section = memblock_alloc(size, align); } #endif @@ -306,7 +307,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_virt_alloc_try_nid_nopanic(size, + p = memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, goal, limit, nid); if (!p && limit) { @@ -362,7 +363,7 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); + return memblock_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -391,9 +392,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, if (map) return map; - map = memblock_virt_alloc_try_nid(size, + map = memblock_alloc_try_nid(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); return map; } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ @@ -405,9 +406,9 @@ static void __init sparse_buffer_init(unsigned long size, int nid) { WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
*/ sparsemap_buf = - memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE, + memblock_alloc_try_nid_raw(size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } diff --git a/mm/swap.c b/mm/swap.c index 87a54c8dee34..aa483719922e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -964,7 +964,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) + if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; @@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag) + xa_mark_t tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, PAGEVEC_SIZE, pvec->pages); @@ -1011,7 +1011,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages) + xa_mark_t tag, unsigned max_pages) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); diff --git a/mm/swap_state.c b/mm/swap_state.c index 0d6a7f268d2e..fd2f21e1c60a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -107,14 +107,15 @@ void show_swap_cache_info(void) } /* - * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) { - int error, i, nr = hpage_nr_pages(page); - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); + unsigned long i, nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -123,73 +124,52 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) page_ref_add(page, nr); SetPageSwapCache(page); - address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - for (i = 0; i < nr; i++) { - set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->i_pages, - idx + i, page + i); - if (unlikely(error)) - break; - } - if (likely(!error)) { + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < nr; i++) { + VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + set_page_private(page + i, entry.val + i); + xas_store(&xas, page + i); + xas_next(&xas); + } address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); ADD_CACHE_INFO(add_total, nr); - } else { - /* - * Only the context which have set SWAP_HAS_CACHE flag - * would call add_to_swap_cache(). - * So add_to_swap_cache() doesn't returns -EEXIST. 
- */ - VM_BUG_ON(error == -EEXIST); - set_page_private(page + i, 0UL); - while (i--) { - radix_tree_delete(&address_space->i_pages, idx + i); - set_page_private(page + i, 0UL); - } - ClearPageSwapCache(page); - page_ref_sub(page, nr); - } - xa_unlock_irq(&address_space->i_pages); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); - return error; -} - - -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) -{ - int error; + if (!xas_error(&xas)) + return 0; - error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); - if (!error) { - error = __add_to_swap_cache(page, entry); - radix_tree_preload_end(); - } - return error; + ClearPageSwapCache(page); + page_ref_sub(page, nr); + return xas_error(&xas); } /* * This must be called only on pages that have * been verified to be in the swap cache. */ -void __delete_from_swap_cache(struct page *page) +void __delete_from_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); int i, nr = hpage_nr_pages(page); - swp_entry_t entry; - pgoff_t idx; + pgoff_t idx = swp_offset(entry); + XA_STATE(xas, &address_space->i_pages, idx); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(PageWriteback(page), page); - entry.val = page_private(page); - address_space = swap_address_space(entry); - idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->i_pages, idx + i); + void *entry = xas_store(&xas, NULL); + VM_BUG_ON_PAGE(entry != page + i, entry); set_page_private(page + i, 0); + xas_next(&xas); } ClearPageSwapCache(page); address_space->nrpages -= nr; @@ -217,7 +197,7 @@ int add_to_swap(struct page *page) return 0; /* - * Radix-tree node allocations from PF_MEMALLOC contexts could + * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * @@ -229,7 +209,6 @@ int add_to_swap(struct page *page) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - /* -ENOMEM radix-tree allocation failure */ if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -263,14 +242,11 @@ fail: */ void delete_from_swap_cache(struct page *page) { - swp_entry_t entry; - struct address_space *address_space; + swp_entry_t entry = { .val = page_private(page) }; + struct address_space *address_space = swap_address_space(entry); - entry.val = page_private(page); - - address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, entry); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); @@ -414,18 +390,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); - if (err) - break; - - /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { - radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page @@ -433,27 +401,20 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ cond_resched(); continue; - } - if (err) { /* swp entry is obsolete ? 
*/ - radix_tree_preload_end(); + } else if (err) /* swp entry is obsolete ? */ break; - } - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + /* May fail (-ENOMEM) if XArray node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { - radix_tree_preload_end(); - /* - * Initiate read into locked page and return. - */ + /* Initiate read into locked page */ SetPageWorkingset(new_page); lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } - radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -626,7 +587,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ diff --git a/mm/truncate.c b/mm/truncate.c index 1d2fb2dca96f..45d68e90b703 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -33,15 +33,12 @@ static inline void __clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_node *node; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); - if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) + xas_set_update(&xas, workingset_update_node); + if (xas_load(&xas) != entry) return; - if (*slot != entry) - return; - __radix_tree_replace(&mapping->i_pages, node, slot, NULL, - workingset_update_node); + xas_store(&xas, NULL); mapping->nrexceptional--; } @@ -70,7 +67,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; for (j = 0; j < pagevec_count(pvec); j++) - if (radix_tree_exceptional_entry(pvec->pages[j])) + if (xa_is_value(pvec->pages[j])) break; if (j == pagevec_count(pvec)) @@ -85,7 +82,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, struct page *page = pvec->pages[i]; pgoff_t index = indices[i]; - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { pvec->pages[j++] = page; continue; } @@ -347,7 +344,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!trylock_page(page)) @@ -442,7 +439,7 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; lock_page(page); @@ -561,7 +558,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, page); continue; @@ -692,7 +689,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, index, page)) ret = -EBUSY; @@ -738,10 +735,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping, index++; } /* - * For DAX we invalidate page tables after invalidating radix tree. We + * For DAX we invalidate page tables after invalidating page cache. 
We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't - * work as we have no cheap way to find whether radix tree entry didn't + * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 28c9ae5633b9..62ac0c488624 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -751,12 +751,12 @@ static inline int is_page_cache_freeable(struct page *page) { /* * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache radix tree and - * optional buffer heads at page->private. + * that isolated the page, the page cache and optional buffer + * heads at page->private. */ - int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? HPAGE_PMD_NR : 1; - return page_count(page) - page_has_private(page) == 1 + radix_pins; + return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@ -932,7 +932,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { diff --git a/mm/workingset.c b/mm/workingset.c index cbc13d4dfa79..d46f8c92aa2f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -160,20 +160,20 @@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. 
In * that case, we have to sacrifice granularity for distance, and group @@ -185,22 +185,21 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << 1) | workingset; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; workingset = entry & 1; entry >>= 1; nid = entry & ((1UL << NODES_SHIFT) - 1); @@ -367,7 +366,7 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@ -379,7 +378,7 @@ void workingset_update_node(struct radix_tree_node *node) */ VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ - if (node->count && node->count == node->exceptional) { + if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); __inc_lruvec_page_state(virt_to_page(node), @@ -404,7 +403,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -419,11 +418,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { @@ -438,7 +437,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, #endif pages = node_present_pages(sc->nid); - max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3); + max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -451,11 +450,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@ -463,15 +462,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. 
+ * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@ -490,29 +488,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) - goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->exceptional--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; + mapping->nrexceptional -= node->nr_values; + xas.xa_node = xa_parent_locked(&mapping->i_pages, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, workingset_update_node); + /* + * We could store a shadow entry here which was the minimum of the + * shadow entries we were tracking ... + */ + xas_store(&xas, NULL); __inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->i_pages, node, - workingset_lookup_update(mapping)); out_invalid: xa_unlock_irq(&mapping->i_pages); |