diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 10 | ||||
-rw-r--r-- | mm/Makefile | 13 | ||||
-rw-r--r-- | mm/backing-dev.c | 46 | ||||
-rw-r--r-- | mm/balloon_compaction.c | 123 | ||||
-rw-r--r-- | mm/bootmem.c | 4 | ||||
-rw-r--r-- | mm/cma.c | 87 | ||||
-rw-r--r-- | mm/compaction.c | 674 | ||||
-rw-r--r-- | mm/debug.c | 237 | ||||
-rw-r--r-- | mm/dmapool.c | 58 | ||||
-rw-r--r-- | mm/filemap.c | 27 | ||||
-rw-r--r-- | mm/gup.c | 358 | ||||
-rw-r--r-- | mm/huge_memory.c | 35 | ||||
-rw-r--r-- | mm/hugetlb.c | 14 | ||||
-rw-r--r-- | mm/internal.h | 26 | ||||
-rw-r--r-- | mm/interval_tree.c | 2 | ||||
-rw-r--r-- | mm/iov_iter.c | 254 | ||||
-rw-r--r-- | mm/kmemcheck.c | 1 | ||||
-rw-r--r-- | mm/ksm.c | 4 | ||||
-rw-r--r-- | mm/memblock.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 318 | ||||
-rw-r--r-- | mm/memory-failure.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 7 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 134 | ||||
-rw-r--r-- | mm/migrate.c | 21 | ||||
-rw-r--r-- | mm/mlock.c | 8 | ||||
-rw-r--r-- | mm/mmap.c | 129 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 5 | ||||
-rw-r--r-- | mm/mprotect.c | 20 | ||||
-rw-r--r-- | mm/mremap.c | 5 | ||||
-rw-r--r-- | mm/nobootmem.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 6 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 357 | ||||
-rw-r--r-- | mm/pagewalk.c | 2 | ||||
-rw-r--r-- | mm/percpu-km.c | 16 | ||||
-rw-r--r-- | mm/percpu-vm.c | 184 | ||||
-rw-r--r-- | mm/percpu.c | 524 | ||||
-rw-r--r-- | mm/rmap.c | 14 | ||||
-rw-r--r-- | mm/shmem.c | 8 | ||||
-rw-r--r-- | mm/slab.c | 364 | ||||
-rw-r--r-- | mm/slab.h | 57 | ||||
-rw-r--r-- | mm/slab_common.c | 178 | ||||
-rw-r--r-- | mm/slob.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 126 | ||||
-rw-r--r-- | mm/swap.c | 30 | ||||
-rw-r--r-- | mm/swap_state.c | 16 | ||||
-rw-r--r-- | mm/truncate.c | 57 | ||||
-rw-r--r-- | mm/util.c | 23 | ||||
-rw-r--r-- | mm/vmalloc.c | 20 | ||||
-rw-r--r-- | mm/vmscan.c | 112 | ||||
-rw-r--r-- | mm/vmstat.c | 153 | ||||
-rw-r--r-- | mm/zbud.c | 13 | ||||
-rw-r--r-- | mm/zsmalloc.c | 46 |
55 files changed, 3103 insertions, 1847 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 886db2158538..1d1ae6b078fd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP boolean +config HAVE_GENERIC_RCU_GUP + boolean + config ARCH_DISCARD_MEMBLOCK boolean @@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK boolean # +# support for memory balloon +config MEMORY_BALLOON + boolean + +# # support for memory balloon compaction config BALLOON_COMPACTION bool "Allow for balloon memory compaction/migration" def_bool y - depends on COMPACTION && VIRTIO_BALLOON + depends on COMPACTION && MEMORY_BALLOON help Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be diff --git a/mm/Makefile b/mm/Makefile index 632ae77e6070..8405eb0023a9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o @@ -11,14 +11,14 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o vmacache.o \ + compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - iov_iter.o $(mmu-y) + iov_iter.o debug.o $(mmu-y) obj-y += init-mm.o @@ -28,6 +28,10 @@ else obj-y += bootmem.o endif +obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o +ifdef CONFIG_MMU + obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o +endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o @@ -64,3 +68,4 @@ 
obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o +obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..0ae0df55000b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -40,7 +40,7 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { spin_lock(&wb1->list_lock); @@ -376,13 +376,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); flush_delayed_work(&bdi->wb.dwork); WARN_ON(!list_empty(&bdi->work_list)); - - /* - * This shouldn't be necessary unless @bdi for some reason has - * unflushed dirty IO after work_list is drained. Do it anyway - * just in case. 
- */ - cancel_delayed_work_sync(&bdi->wb.dwork); + WARN_ON(delayed_work_pending(&bdi->wb.dwork)); } /* @@ -402,21 +396,15 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - struct device *dev = bdi->dev; - - if (dev) { + if (bdi->dev) { bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); - - spin_lock_bh(&bdi->wb_lock); + device_unregister(bdi->dev); bdi->dev = NULL; - spin_unlock_bh(&bdi->wb_lock); - - device_unregister(dev); } } EXPORT_SYMBOL(bdi_unregister); @@ -455,7 +443,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } @@ -470,7 +458,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: @@ -487,8 +475,17 @@ void bdi_destroy(struct backing_dev_info *bdi) int i; /* - * Splice our entries to the default_backing_dev_info, if this - * bdi disappears + * Splice our entries to the default_backing_dev_info. This + * condition shouldn't happen. @wb must be empty at this point and + * dirty inodes on it might cause other issues. This workaround is + * added by ce5f8e779519 ("writeback: splice dirty inode entries to + * default bdi on bdi_destroy()") without root-causing the issue. + * + * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com + * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 + * + * We should probably add WARN_ON() to find out whether it still + * happens and track it down if so. 
*/ if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; @@ -503,12 +500,7 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); - /* - * If bdi_unregister() had already been called earlier, the dwork - * could still be pending because bdi_prune_sb() can race with the - * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). - */ - cancel_delayed_work_sync(&bdi->wb.dwork); + WARN_ON(delayed_work_pending(&bdi->wb.dwork)); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); @@ -631,7 +623,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * of sleeping on the congestion queue */ if (atomic_read(&nr_bdi_congested[sync]) == 0 || - !zone_is_reclaim_congested(zone)) { + !test_bit(ZONE_CONGESTED, &zone->flags)) { cond_resched(); /* In case we scheduled, work out time remaining */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 6e45a5074bf0..b3cbe19f71b5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -11,32 +11,6 @@ #include <linux/balloon_compaction.h> /* - * balloon_devinfo_alloc - allocates a balloon device information descriptor. - * @balloon_dev_descriptor: pointer to reference the balloon device which - * this struct balloon_dev_info will be servicing. - * - * Driver must call it to properly allocate and initialize an instance of - * struct balloon_dev_info which will be used to reference a balloon device - * as well as to keep track of the balloon device page list. 
- */ -struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) -{ - struct balloon_dev_info *b_dev_info; - b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); - if (!b_dev_info) - return ERR_PTR(-ENOMEM); - - b_dev_info->balloon_device = balloon_dev_descriptor; - b_dev_info->mapping = NULL; - b_dev_info->isolated_pages = 0; - spin_lock_init(&b_dev_info->pages_lock); - INIT_LIST_HEAD(&b_dev_info->pages); - - return b_dev_info; -} -EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); - -/* * balloon_page_enqueue - allocates a new page and inserts it into the balloon * page list. * @b_dev_info: balloon device decriptor where we will insert a new page to @@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) */ BUG_ON(!trylock_page(page)); spin_lock_irqsave(&b_dev_info->pages_lock, flags); - balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); + balloon_page_insert(b_dev_info, page); + __count_vm_event(BALLOON_INFLATE); spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); return page; @@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) * to be released by the balloon driver. */ if (trylock_page(page)) { + if (!PagePrivate(page)) { + /* raced with isolation */ + unlock_page(page); + continue; + } spin_lock_irqsave(&b_dev_info->pages_lock, flags); - /* - * Raise the page refcount here to prevent any wrong - * attempt to isolate this page, in case of coliding - * with balloon_page_isolate() just after we release - * the page lock. - * - * balloon_page_free() will take care of dropping - * this extra refcount later. 
- */ - get_page(page); balloon_page_delete(page); + __count_vm_event(BALLOON_DEFLATE); spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); dequeued_page = true; @@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -/* - * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. - * @b_dev_info: holds the balloon device information descriptor. - * @a_ops: balloon_mapping address_space_operations descriptor. - * - * Driver must call it to properly allocate and initialize an instance of - * struct address_space which will be used as the special page->mapping for - * balloon device enlisted page instances. - */ -struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, - const struct address_space_operations *a_ops) -{ - struct address_space *mapping; - - mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); - if (!mapping) - return ERR_PTR(-ENOMEM); - - /* - * Give a clean 'zeroed' status to all elements of this special - * balloon page->mapping struct address_space instance. - */ - address_space_init_once(mapping); - - /* - * Set mapping->flags appropriately, to allow balloon pages - * ->mapping identification. - */ - mapping_set_balloon(mapping); - mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); - - /* balloon's page->mapping->a_ops callback descriptor */ - mapping->a_ops = a_ops; - - /* - * Establish a pointer reference back to the balloon device descriptor - * this particular page->mapping will be servicing. - * This is used by compaction / migration procedures to identify and - * access the balloon device pageset while isolating / migrating pages. - * - * As some balloon drivers can register multiple balloon devices - * for a single guest, this also helps compaction / migration to - * properly deal with multiple balloon pagesets, when required. 
- */ - mapping->private_data = b_dev_info; - b_dev_info->mapping = mapping; - - return mapping; -} -EXPORT_SYMBOL_GPL(balloon_mapping_alloc); static inline void __isolate_balloon_page(struct page *page) { - struct balloon_dev_info *b_dev_info = page->mapping->private_data; + struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + ClearPagePrivate(page); list_del(&page->lru); b_dev_info->isolated_pages++; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); @@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page) static inline void __putback_balloon_page(struct page *page) { - struct balloon_dev_info *b_dev_info = page->mapping->private_data; + struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + SetPagePrivate(page); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } -static inline int __migrate_balloon_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); -} - /* __isolate_lru_page() counterpart for a ballooned page */ bool balloon_page_isolate(struct page *page) { @@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page) */ if (likely(trylock_page(page))) { /* - * A ballooned page, by default, has just one refcount. + * A ballooned page, by default, has PagePrivate set. * Prevent concurrent compaction threads from isolating - * an already isolated balloon page by refcount check. + * an already isolated balloon page by clearing it. 
*/ - if (__is_movable_balloon_page(page) && - page_count(page) == 2) { + if (balloon_page_movable(page)) { __isolate_balloon_page(page); unlock_page(page); return true; @@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page) int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { - struct address_space *mapping; + struct balloon_dev_info *balloon = balloon_page_device(page); int rc = -EAGAIN; /* @@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage, return rc; } - mapping = page->mapping; - if (mapping) - rc = __migrate_balloon_page(mapping, newpage, page, mode); + if (balloon && balloon->migratepage) + rc = balloon->migratepage(balloon, newpage, page, mode); unlock_page(newpage); return rc; diff --git a/mm/bootmem.c b/mm/bootmem.c index 90bd3507b413..8a000cebb0d7 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -16,9 +16,9 @@ #include <linux/kmemleak.h> #include <linux/range.h> #include <linux/memblock.h> +#include <linux/bug.h> +#include <linux/io.h> -#include <asm/bug.h> -#include <asm/io.h> #include <asm/processor.h> #include "internal.h" @@ -32,6 +32,7 @@ #include <linux/slab.h> #include <linux/log2.h> #include <linux/cma.h> +#include <linux/highmem.h> struct cma { unsigned long base_pfn; @@ -57,7 +58,9 @@ unsigned long cma_get_size(struct cma *cma) static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) { - return (1UL << (align_order >> cma->order_per_bit)) - 1; + if (align_order <= cma->order_per_bit) + return 0; + return (1UL << (align_order - cma->order_per_bit)) - 1; } static unsigned long cma_bitmap_maxno(struct cma *cma) @@ -140,6 +143,54 @@ static int __init cma_init_reserved_areas(void) core_initcall(cma_init_reserved_areas); /** + * cma_init_reserved_mem() - create custom contiguous area from reserved memory + * @base: Base address of the reserved area + * @size: Size of the reserved area (in bytes), + * @order_per_bit: Order of pages represented by one bit on bitmap. 
+ * @res_cma: Pointer to store the created cma region. + * + * This function creates custom contiguous area from already reserved memory. + */ +int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + int order_per_bit, struct cma **res_cma) +{ + struct cma *cma; + phys_addr_t alignment; + + /* Sanity checks */ + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size || !memblock_is_region_reserved(base, size)) + return -EINVAL; + + /* ensure minimal alignment requied by mm core */ + alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + + /* alignment should be aligned with order_per_bit */ + if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + return -EINVAL; + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. 
+ */ + cma = &cma_areas[cma_area_count]; + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + return 0; +} + +/** * cma_declare_contiguous() - reserve custom contiguous area * @base: Base address of the reserved area optional, use 0 for any * @size: Size of the reserved area (in bytes), @@ -162,7 +213,8 @@ int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma) { - struct cma *cma; + phys_addr_t memblock_end = memblock_end_of_DRAM(); + phys_addr_t highmem_start = __pa(high_memory); int ret = 0; pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", @@ -196,6 +248,24 @@ int __init cma_declare_contiguous(phys_addr_t base, if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; + /* + * adjust limit to avoid crossing low/high memory boundary for + * automatically allocated regions + */ + if (((limit == 0 || limit > memblock_end) && + (memblock_end - size < highmem_start && + memblock_end > highmem_start)) || + (!fixed && limit > highmem_start && limit - size < highmem_start)) { + limit = highmem_start; + } + + if (fixed && base < highmem_start && base+size > highmem_start) { + ret = -EINVAL; + pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", + (unsigned long)base, (unsigned long)highmem_start); + goto err; + } + /* Reserve memory */ if (base && fixed) { if (memblock_is_region_reserved(base, size) || @@ -214,16 +284,9 @@ int __init cma_declare_contiguous(phys_addr_t base, } } - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. 
- */ - cma = &cma_areas[cma_area_count]; - cma->base_pfn = PFN_DOWN(base); - cma->count = size >> PAGE_SHIFT; - cma->order_per_bit = order_per_bit; - *res_cma = cma; - cma_area_count++; + ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); + if (ret) + goto err; pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, (unsigned long)base); diff --git a/mm/compaction.c b/mm/compaction.c index 21bf292b642a..edba18aed173 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration of free compaction scanner. The scanners then need to + * use only pfn_valid_within() check for arches that allow holes within + * pageblocks. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zones range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock. 
+ */ +static struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_page(start_pfn); + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives a shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} + #ifdef CONFIG_COMPACTION /* Returns true if the pageblock should be scanned for pages to isolate. */ static inline bool isolation_suitable(struct compact_control *cc, @@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool set_unsuitable, bool migrate_scanner) + bool migrate_scanner) { struct zone *zone = cc->zone; unsigned long pfn; @@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc, if (nr_isolated) return; - /* - * Only skip pageblocks when all forms of compaction will be known to - * fail in the near future. - */ - if (set_unsuitable) - set_pageblock_skip(page); + set_pageblock_skip(page); pfn = page_to_pfn(page); @@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool set_unsuitable, bool migrate_scanner) + bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ -static inline bool should_release_lock(spinlock_t *lock) +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. For async compaction, back out if the lock cannot + * be taken immediately. For sync compaction, spin on the lock if needed. 
+ * + * Returns true if the lock is held + * Returns false if the lock is not held and compaction should abort + */ +static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, + struct compact_control *cc) { - return need_resched() || spin_is_contended(lock); + if (cc->mode == MIGRATE_ASYNC) { + if (!spin_trylock_irqsave(lock, *flags)) { + cc->contended = COMPACT_CONTENDED_LOCK; + return false; + } + } else { + spin_lock_irqsave(lock, *flags); + } + + return true; } /* * Compaction requires the taking of some coarse locks that are potentially - * very heavily contended. Check if the process needs to be scheduled or - * if the lock is contended. For async compaction, back out in the event - * if contention is severe. For sync compaction, schedule. + * very heavily contended. The lock should be periodically unlocked to avoid + * having disabled IRQs for a long time, even when there is nobody waiting on + * the lock. It might also be that allowing the IRQs will result in + * need_resched() becoming true. If scheduling is needed, async compaction + * aborts. Sync compaction schedules. + * Either compaction type will also abort if a fatal signal is pending. + * In either case if the lock was locked, it is dropped and not regained. * - * Returns true if the lock is held. 
- * Returns false if the lock is released and compaction should abort + * Returns true if compaction should abort due to fatal signal pending, or + * async compaction due to need_resched() + * Returns false when compaction can continue (sync compaction might have + * scheduled) */ -static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, - bool locked, struct compact_control *cc) +static bool compact_unlock_should_abort(spinlock_t *lock, + unsigned long flags, bool *locked, struct compact_control *cc) { - if (should_release_lock(lock)) { - if (locked) { - spin_unlock_irqrestore(lock, *flags); - locked = false; - } + if (*locked) { + spin_unlock_irqrestore(lock, flags); + *locked = false; + } + + if (fatal_signal_pending(current)) { + cc->contended = COMPACT_CONTENDED_SCHED; + return true; + } - /* async aborts if taking too long or contended */ + if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; - return false; + cc->contended = COMPACT_CONTENDED_SCHED; + return true; } - cond_resched(); } - if (!locked) - spin_lock_irqsave(lock, *flags); - return true; + return false; } /* * Aside from avoiding lock contention, compaction also periodically checks * need_resched() and either schedules in sync compaction or aborts async - * compaction. This is similar to what compact_checklock_irqsave() does, but + * compaction. This is similar to what compact_unlock_should_abort() does, but * is used where no lock is concerned. * * Returns false when no scheduling was needed, or sync compaction scheduled. 
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc) /* async compaction aborts if contended */ if (need_resched()) { if (cc->mode == MIGRATE_ASYNC) { - cc->contended = true; + cc->contended = COMPACT_CONTENDED_SCHED; return true; } @@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc) static bool suitable_migration_target(struct page *page) { /* If the page is a large free page, then disallow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return false; + if (PageBuddy(page)) { + /* + * We are checking page_order without zone->lock taken. But + * the only small danger is that we skip a potentially suitable + * pageblock, so it's not worth to check order for valid range. + */ + if (page_order_unsafe(page) >= pageblock_order) + return false; + } /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ if (migrate_async_suitable(get_pageblock_migratetype(page))) @@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page) * (even though it may still end up isolating some pages). */ static unsigned long isolate_freepages_block(struct compact_control *cc, - unsigned long blockpfn, + unsigned long *start_pfn, unsigned long end_pfn, struct list_head *freelist, bool strict) { int nr_scanned = 0, total_isolated = 0; struct page *cursor, *valid_page = NULL; - unsigned long flags; + unsigned long flags = 0; bool locked = false; - bool checked_pageblock = false; + unsigned long blockpfn = *start_pfn; cursor = pfn_to_page(blockpfn); @@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, int isolated, i; struct page *page = cursor; + /* + * Periodically drop the lock (if held) regardless of its + * contention, to give chance to IRQs. 
Abort if fatal signal + * pending or async compaction detects need_resched() + */ + if (!(blockpfn % SWAP_CLUSTER_MAX) + && compact_unlock_should_abort(&cc->zone->lock, flags, + &locked, cc)) + break; + nr_scanned++; if (!pfn_valid_within(blockpfn)) goto isolate_fail; @@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, goto isolate_fail; /* - * The zone lock must be held to isolate freepages. - * Unfortunately this is a very coarse lock and can be - * heavily contended if there are parallel allocations - * or parallel compactions. For async compaction do not - * spin on the lock and we acquire the lock as late as - * possible. + * If we already hold the lock, we can skip some rechecking. + * Note that if we hold the lock now, checked_pageblock was + * already set in some previous iteration (or strict is true), + * so it is correct to skip the suitable migration target + * recheck as well. */ - locked = compact_checklock_irqsave(&cc->zone->lock, &flags, - locked, cc); - if (!locked) - break; - - /* Recheck this is a suitable migration target under lock */ - if (!strict && !checked_pageblock) { + if (!locked) { /* - * We need to check suitability of pageblock only once - * and this isolate_freepages_block() is called with - * pageblock range, so just check once is sufficient. + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. 
*/ - checked_pageblock = true; - if (!suitable_migration_target(page)) + locked = compact_trylock_irqsave(&cc->zone->lock, + &flags, cc); + if (!locked) break; - } - /* Recheck this is a buddy page under lock */ - if (!PageBuddy(page)) - goto isolate_fail; + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto isolate_fail; + } /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); @@ -346,6 +423,9 @@ isolate_fail: } + /* Record how far we have got within the block */ + *start_pfn = blockpfn; + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); /* @@ -361,8 +441,7 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, true, - false); + update_pageblock_skip(cc, valid_page, total_isolated, false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc, unsigned long isolated, pfn, block_end_pfn; LIST_HEAD(freelist); - for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { - if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) - break; + pfn = start_pfn; + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + + for (; pfn < end_pfn; pfn += isolated, + block_end_pfn += pageblock_nr_pages) { + /* Protect pfn from changing by isolate_freepages_block */ + unsigned long isolate_start_pfn = pfn; - /* - * On subsequent iterations ALIGN() is actually not needed, - * but we keep it that we not to complicate the code. 
- */ - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - isolated = isolate_freepages_block(cc, pfn, block_end_pfn, - &freelist, true); + if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + break; + + isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, &freelist, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc, } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) +static void acct_isolated(struct zone *zone, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; + if (list_empty(&cc->migratepages)) + return; + list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - /* If locked we can use the interrupt unsafe versions */ - if (locked) { - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); - } else { - mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); - } + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ @@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone) } /** - * isolate_migratepages_range() - isolate all migrate-able pages in range. - * @zone: Zone pages are in. + * isolate_migratepages_block() - isolate all migrate-able pages within + * a single pageblock * @cc: Compaction control structure. - * @low_pfn: The first PFN of the range. - * @end_pfn: The one-past-the-last PFN of the range. 
- * @unevictable: true if it allows to isolate unevictable pages + * @low_pfn: The first PFN to isolate + * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock + * @isolate_mode: Isolation mode to be used. * * Isolate all pages that can be migrated from the range specified by - * [low_pfn, end_pfn). Returns zero if there is a fatal signal - * pending), otherwise PFN of the first page that was not scanned - * (which may be both less, equal to or more then end_pfn). + * [low_pfn, end_pfn). The range is expected to be within same pageblock. + * Returns zero if there is a fatal signal pending, otherwise PFN of the + * first page that was not scanned (which may be both less, equal to or more + * than end_pfn). * - * Assumes that cc->migratepages is empty and cc->nr_migratepages is - * zero. - * - * Apart from cc->migratepages and cc->nr_migratetypes this function - * does not modify any cc's fields, in particular it does not modify - * (or read for that matter) cc->migrate_pfn. + * The pages are isolated on cc->migratepages list (not required to be empty), + * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field + * is neither read nor updated. */ -unsigned long -isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn, bool unevictable) +static unsigned long +isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + unsigned long end_pfn, isolate_mode_t isolate_mode) { - unsigned long last_pageblock_nr = 0, pageblock_nr; + struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; - unsigned long flags; + unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool set_unsuitable = true; - const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? - ISOLATE_ASYNC_MIGRATE : 0) | - (unevictable ? 
ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { - /* give a chance to irqs before checking need_resched() */ - if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { - if (should_release_lock(&zone->lru_lock)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - locked = false; - } - } - /* - * migrate_pfn does not necessarily start aligned to a - * pageblock. Ensure that pfn_valid is called when moving - * into a new MAX_ORDER_NR_PAGES range in case of large - * memory holes within the zone + * Periodically drop the lock (if held) regardless of its + * contention, to give chance to IRQs. Abort async compaction + * if contended. */ - if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { - if (!pfn_valid(low_pfn)) { - low_pfn += MAX_ORDER_NR_PAGES - 1; - continue; - } - } + if (!(low_pfn % SWAP_CLUSTER_MAX) + && compact_unlock_should_abort(&zone->lru_lock, flags, + &locked, cc)) + break; if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; - /* - * Get the page and ensure the page is within the same zone. - * See the comment in isolate_freepages about overlapping - * nodes. It is deliberate that the new zone lock is not taken - * as memory compaction should not move pages between nodes. - */ page = pfn_to_page(low_pfn); - if (page_zone(page) != zone) - continue; if (!valid_page) valid_page = page; - /* If isolation recently failed, do not retry */ - pageblock_nr = low_pfn >> pageblock_order; - if (last_pageblock_nr != pageblock_nr) { - int mt; - - last_pageblock_nr = pageblock_nr; - if (!isolation_suitable(cc, page)) - goto next_pageblock; + /* + * Skip if free. 
We read page order here without zone lock + * which is generally unsafe, but the race window is small and + * the worst thing that can happen is that we skip some + * potential isolation targets. + */ + if (PageBuddy(page)) { + unsigned long freepage_order = page_order_unsafe(page); /* - * For async migration, also only scan in MOVABLE - * blocks. Async migration is optimistic to see if - * the minimum amount of work satisfies the allocation + * Without lock, we cannot be sure that what we got is + * a valid page order. Consider only values in the + * valid order range to prevent low_pfn overflow. */ - mt = get_pageblock_migratetype(page); - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(mt)) { - set_unsuitable = false; - goto next_pageblock; - } - } - - /* - * Skip if free. page_order cannot be used without zone->lock - * as nothing prevents parallel allocations or buddy merging. - */ - if (PageBuddy(page)) + if (freepage_order > 0 && freepage_order < MAX_ORDER) + low_pfn += (1UL << freepage_order) - 1; continue; + } /* * Check may be lockless but that's ok as we recheck later. 
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (!PageLRU(page)) { if (unlikely(balloon_page_movable(page))) { - if (locked && balloon_page_isolate(page)) { + if (balloon_page_isolate(page)) { /* Successfully isolated */ goto isolate_success; } @@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ if (PageTransHuge(page)) { if (!locked) - goto next_pageblock; - low_pfn += (1 << compound_order(page)) - 1; + low_pfn = ALIGN(low_pfn + 1, + pageblock_nr_pages) - 1; + else + low_pfn += (1 << compound_order(page)) - 1; + continue; } @@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, page_count(page) > page_mapcount(page)) continue; - /* Check if it is ok to still hold the lock */ - locked = compact_checklock_irqsave(&zone->lru_lock, &flags, - locked, cc); - if (!locked || fatal_signal_pending(current)) - break; + /* If we already hold the lock, we can skip some rechecking */ + if (!locked) { + locked = compact_trylock_irqsave(&zone->lru_lock, + &flags, cc); + if (!locked) + break; - /* Recheck PageLRU and PageTransHuge under lock */ - if (!PageLRU(page)) - continue; - if (PageTransHuge(page)) { - low_pfn += (1 << compound_order(page)) - 1; - continue; + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } } lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ - if (__isolate_lru_page(page, mode) != 0) + if (__isolate_lru_page(page, isolate_mode) != 0) continue; VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -667,14 +715,14 @@ isolate_success: ++low_pfn; break; } - - continue; - -next_pageblock: - low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; } - acct_isolated(zone, locked, cc); + /* + * The PageBuddy() check could have potentially brought us outside + * the range to be scanned. 
+ */ + if (unlikely(low_pfn > end_pfn)) + low_pfn = end_pfn; if (locked) spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -684,8 +732,7 @@ next_pageblock: * if the whole pageblock was scanned without isolating any page. */ if (low_pfn == end_pfn) - update_pageblock_skip(cc, valid_page, nr_isolated, - set_unsuitable, true); + update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -696,17 +743,65 @@ next_pageblock: return low_pfn; } +/** + * isolate_migratepages_range() - isolate migrate-able pages in a PFN range + * @cc: Compaction control structure. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Returns zero if isolation fails fatally due to e.g. pending signal. + * Otherwise, function returns one-past-the-last PFN of isolated page + * (which may be greater than end_pfn if end fell in a middle of a THP page). + */ +unsigned long +isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn, block_end_pfn; + + /* Scan block by block. First and last block may be incomplete */ + pfn = start_pfn; + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + + for (; pfn < end_pfn; pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, end_pfn); + + if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + continue; + + pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); + + /* + * In case of fatal failure, release everything that might + * have been isolated in the previous iteration, and signal + * the failure back to caller. 
+ */ + if (!pfn) { + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + break; + } + } + acct_isolated(cc->zone, cc); + + return pfn; +} + #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION /* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ -static void isolate_freepages(struct zone *zone, - struct compact_control *cc) +static void isolate_freepages(struct compact_control *cc) { + struct zone *zone = cc->zone; struct page *page; unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ int nr_freepages = cc->nr_freepages; @@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone, /* * Initialise the free scanner. The starting point is where we last * successfully isolated from, zone-cached value, or the end of the - * zone when isolating for the first time. We need this aligned to - * the pageblock boundary, because we do + * zone when isolating for the first time. For looping we also need + * this pfn aligned down to the pageblock boundary, because we do * block_start_pfn -= pageblock_nr_pages in the for loop. * For ending point, take care when isolating in last pageblock of a * a zone which ends in the middle of a pageblock. * The low boundary is the end of the pageblock the migration scanner * is using. 
*/ + isolate_start_pfn = cc->free_pfn; block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); @@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone, */ for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; block_end_pfn = block_start_pfn, - block_start_pfn -= pageblock_nr_pages) { + block_start_pfn -= pageblock_nr_pages, + isolate_start_pfn = block_start_pfn) { unsigned long isolated; /* @@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone, && compact_should_abort(cc)) break; - if (!pfn_valid(block_start_pfn)) - continue; - - /* - * Check for overlapping nodes/zones. It's possible on some - * configurations to have a setup like - * node0 node1 node0 - * i.e. it's possible that all pages within a zones range of - * pages do not belong to a single zone. - */ - page = pfn_to_page(block_start_pfn); - if (page_zone(page) != zone) + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); + if (!page) continue; /* Check the block is suitable for migration */ @@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone, if (!isolation_suitable(cc, page)) continue; - /* Found a block suitable for isolating free pages from */ - cc->free_pfn = block_start_pfn; - isolated = isolate_freepages_block(cc, block_start_pfn, + /* Found a block suitable for isolating free pages from. */ + isolated = isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, freelist, false); nr_freepages += isolated; /* + * Remember where the free scanner should restart next time, + * which is where isolate_freepages_block() left off. + * But if it scanned the whole pageblock, isolate_start_pfn + * now points at block_end_pfn, which is the start of the next + * pageblock. + * In that case we will however want to restart at the start + * of the previous pageblock. + */ + cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? 
+ isolate_start_pfn : + block_start_pfn - pageblock_nr_pages; + + /* * Set a flag that we successfully isolated in this pageblock. * In the next loop iteration, zone->compact_cached_free_pfn * will not be updated and thus it will effectively contain the @@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage, */ if (list_empty(&cc->freepages)) { if (!cc->contended) - isolate_freepages(cc->zone, cc); + isolate_freepages(cc); if (list_empty(&cc->freepages)) return NULL; @@ -856,38 +956,84 @@ typedef enum { } isolate_migrate_t; /* - * Isolate all pages that can be migrated from the block pointed to by - * the migrate scanner within compact_control. + * Isolate all pages that can be migrated from the first suitable block, + * starting at the block pointed to by the migrate scanner pfn within + * compact_control. */ static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + struct page *page; + const isolate_mode_t isolate_mode = + (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); - /* Do not scan outside zone boundaries */ - low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + /* + * Start at where we last stopped, or beginning of the zone as + * initialized by compact_zone() + */ + low_pfn = cc->migrate_pfn; /* Only scan within a pageblock boundary */ end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); - /* Do not cross the free scanner or scan within a memory hole */ - if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { - cc->migrate_pfn = end_pfn; - return ISOLATE_NONE; - } + /* + * Iterate over whole pageblocks until we find the first suitable. + * Do not cross the free scanner. 
+ */ + for (; end_pfn <= cc->free_pfn; + low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { - /* Perform the isolation */ - low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); - if (!low_pfn || cc->contended) - return ISOLATE_ABORT; + /* + * This can potentially iterate a massively long zone with + * many pageblocks unsuitable, so periodically check if we + * need to schedule, or even abort async compaction. + */ + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; + + page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + if (!page) + continue; + + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + + /* + * For async compaction, also only scan in MOVABLE blocks. + * Async compaction is optimistic to see if the minimum amount + * of work satisfies the allocation. + */ + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(get_pageblock_migratetype(page))) + continue; + + /* Perform the isolation */ + low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, + isolate_mode); + if (!low_pfn || cc->contended) + return ISOLATE_ABORT; + + /* + * Either we isolated something and proceed with migration. Or + * we failed and compact_zone should decide if we should + * continue or not. + */ + break; + } + + acct_isolated(zone, cc); + /* Record where migration scanner will be restarted */ cc->migrate_pfn = low_pfn; - return ISOLATE_SUCCESS; + return cc->nr_migratepages ? 
ISOLATE_SUCCESS : ISOLATE_NONE; } -static int compact_finished(struct zone *zone, - struct compact_control *cc) +static int compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) { unsigned int order; unsigned long watermark; @@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone, struct free_area *area = &zone->free_area[order]; /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[cc->migratetype])) + if (!list_empty(&area->free_list[migratetype])) return COMPACT_PARTIAL; /* Job done if allocation would set block type */ @@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); @@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); - while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc, migratetype)) == + COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { @@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - if (!cc->nr_migratepages) - continue; - err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); @@ -1092,14 +1237,14 @@ out: } static unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, bool *contended) + gfp_t gfp_mask, enum migrate_mode mode, int *contended) { unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, .order = order, - .migratetype = allocflags_to_migratetype(gfp_mask), + .gfp_mask = gfp_mask, .zone = zone, .mode = mode, }; @@ -1124,48 +1269,117 @@ int 
sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @mode: The migration mode for async, sync light, or sync migration - * @contended: Return value that is true if compaction was aborted due to lock contention - * @page: Optionally capture a free page of the requested order during compaction + * @contended: Return value that determines if compaction was aborted due to + * need_resched() or lock contention + * @candidate_zone: Return the zone where we think allocation should succeed * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended) + enum migrate_mode mode, int *contended, + struct zone **candidate_zone) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_SKIPPED; + int rc = COMPACT_DEFERRED; int alloc_flags = 0; + int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ + + *contended = COMPACT_CONTENDED_NONE; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) - return rc; - - count_compact_event(COMPACTSTALL); + return COMPACT_SKIPPED; #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { int status; + int zone_contended; + + if (compaction_deferred(zone, order)) + continue; status = compact_zone_order(zone, order, gfp_mask, mode, - contended); + &zone_contended); rc = max(status, rc); + /* + * It takes at least one zone that wasn't lock contended + * to clear all_zones_contended. 
+ */ + all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) - break; + alloc_flags)) { + *candidate_zone = zone; + /* + * We think the allocation will succeed in this zone, + * but it is not certain, hence the false. The caller + * will repeat this with true if allocation indeed + * succeeds in this zone. + */ + compaction_defer_reset(zone, order, false); + /* + * It is possible that async compaction aborted due to + * need_resched() and the watermarks were ok thanks to + * somebody else freeing memory. The allocation can + * however still fail so we better signal the + * need_resched() contention anyway (this will not + * prevent the allocation attempt). + */ + if (zone_contended == COMPACT_CONTENDED_SCHED) + *contended = COMPACT_CONTENDED_SCHED; + + goto break_loop; + } + + if (mode != MIGRATE_ASYNC) { + /* + * We think that allocation won't succeed in this zone + * so we defer compaction there. If it ends up + * succeeding after all, it will be reset. + */ + defer_compaction(zone, order); + } + + /* + * We might have stopped compacting due to need_resched() in + * async compaction, or due to a fatal signal detected. In that + * case do not try further zones and signal need_resched() + * contention. + */ + if ((zone_contended == COMPACT_CONTENDED_SCHED) + || fatal_signal_pending(current)) { + *contended = COMPACT_CONTENDED_SCHED; + goto break_loop; + } + + continue; +break_loop: + /* + * We might not have tried all the zones, so be conservative + * and assume they are not all lock contended. + */ + all_zones_contended = 0; + break; } + /* + * If at least one zone wasn't deferred or skipped, we report if all + * zones that were tried were lock contended. 
+ */ + if (rc > COMPACT_SKIPPED && all_zones_contended) + *contended = COMPACT_CONTENDED_LOCK; + return rc; } diff --git a/mm/debug.c b/mm/debug.c new file mode 100644 index 000000000000..5ce45c9a29b5 --- /dev/null +++ b/mm/debug.c @@ -0,0 +1,237 @@ +/* + * mm/debug.c + * + * mm/ specific debug routines. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/ftrace_event.h> +#include <linux/memcontrol.h> + +static const struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, +#endif +}; + +static void dump_flags(unsigned long flags, + const struct trace_print_flags *names, int count) +{ + const char *delim = ""; + unsigned long mask; + int i; + + pr_emerg("flags: %#lx(", flags); + + /* remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; i < count && flags; i++) { + + mask = names[i].mask; + if ((flags & mask) != 
mask) + continue; + + flags &= ~mask; + pr_cont("%s%s", delim, names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + pr_cont("%s%#lx", delim, flags); + + pr_cont(")\n"); +} + +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) +{ + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, atomic_read(&page->_count), page_mapcount(page), + page->mapping, page->index); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); + dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + if (reason) + pr_alert("page dumped because: %s\n", reason); + if (page->flags & badflags) { + pr_alert("bad because of flags:\n"); + dump_flags(page->flags & badflags, + pageflag_names, ARRAY_SIZE(pageflag_names)); + } + mem_cgroup_print_bad_page(page); +} + +void dump_page(struct page *page, const char *reason) +{ + dump_page_badflags(page, reason, 0); +} +EXPORT_SYMBOL(dump_page); + +#ifdef CONFIG_DEBUG_VM + +static const struct trace_print_flags vmaflags_names[] = { + {VM_READ, "read" }, + {VM_WRITE, "write" }, + {VM_EXEC, "exec" }, + {VM_SHARED, "shared" }, + {VM_MAYREAD, "mayread" }, + {VM_MAYWRITE, "maywrite" }, + {VM_MAYEXEC, "mayexec" }, + {VM_MAYSHARE, "mayshare" }, + {VM_GROWSDOWN, "growsdown" }, + {VM_PFNMAP, "pfnmap" }, + {VM_DENYWRITE, "denywrite" }, + {VM_LOCKED, "locked" }, + {VM_IO, "io" }, + {VM_SEQ_READ, "seqread" }, + {VM_RAND_READ, "randread" }, + {VM_DONTCOPY, "dontcopy" }, + {VM_DONTEXPAND, "dontexpand" }, + {VM_ACCOUNT, "account" }, + {VM_NORESERVE, "noreserve" }, + {VM_HUGETLB, "hugetlb" }, + {VM_NONLINEAR, "nonlinear" }, +#if defined(CONFIG_X86) + {VM_PAT, "pat" }, +#elif defined(CONFIG_PPC) + {VM_SAO, "sao" }, +#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) + {VM_GROWSUP, "growsup" }, +#elif !defined(CONFIG_MMU) + {VM_MAPPED_COPY, "mappedcopy" }, +#else + {VM_ARCH_1, "arch_1" }, +#endif + {VM_DONTDUMP, "dontdump" }, +#ifdef 
CONFIG_MEM_SOFT_DIRTY + {VM_SOFTDIRTY, "softdirty" }, +#endif + {VM_MIXEDMAP, "mixedmap" }, + {VM_HUGEPAGE, "hugepage" }, + {VM_NOHUGEPAGE, "nohugepage" }, + {VM_MERGEABLE, "mergeable" }, +}; + +void dump_vma(const struct vm_area_struct *vma) +{ + pr_emerg("vma %p start %p end %p\n" + "next %p prev %p mm %p\n" + "prot %lx anon_vma %p vm_ops %p\n" + "pgoff %lx file %p private_data %p\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, + vma->vm_prev, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data); + dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); +} +EXPORT_SYMBOL(dump_vma); + +void dump_mm(const struct mm_struct *mm) +{ + pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" +#ifdef CONFIG_MMU + "get_unmapped_area %p\n" +#endif + "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" + "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "start_code %lx end_code %lx start_data %lx end_data %lx\n" + "start_brk %lx brk %lx start_stack %lx\n" + "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" + "binfmt %p flags %lx core_state %p\n" +#ifdef CONFIG_AIO + "ioctx_table %p\n" +#endif +#ifdef CONFIG_MEMCG + "owner %p " +#endif + "exe_file %p\n" +#ifdef CONFIG_MMU_NOTIFIER + "mmu_notifier_mm %p\n" +#endif +#ifdef CONFIG_NUMA_BALANCING + "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + "tlb_flush_pending %d\n" +#endif + "%s", /* This is here to hold the comma */ + + mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, +#ifdef CONFIG_MMU + mm->get_unmapped_area, +#endif + mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + mm->pgd, atomic_read(&mm->mm_users), + atomic_read(&mm->mm_count), + 
atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm->map_count, + mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, + mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->start_code, mm->end_code, mm->start_data, mm->end_data, + mm->start_brk, mm->brk, mm->start_stack, + mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, + mm->binfmt, mm->flags, mm->core_state, +#ifdef CONFIG_AIO + mm->ioctx_table, +#endif +#ifdef CONFIG_MEMCG + mm->owner, +#endif + mm->exe_file, +#ifdef CONFIG_MMU_NOTIFIER + mm->mmu_notifier_mm, +#endif +#ifdef CONFIG_NUMA_BALANCING + mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, +#endif +#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) + mm->tlb_flush_pending, +#endif + "" /* This is here to not have a comma! */ + ); + + dump_flags(mm->def_flags, vmaflags_names, + ARRAY_SIZE(vmaflags_names)); +} + +#endif /* CONFIG_DEBUG_VM */ diff --git a/mm/dmapool.c b/mm/dmapool.c index 306baa594f95..fd5fe4342e93 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ }; static DEFINE_MUTEX(pools_lock); +static DEFINE_MUTEX(pools_reg_lock); static ssize_t show_pools(struct device *dev, struct device_attribute *attr, char *buf) @@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; + bool empty = false; - if (align == 0) { + if (align == 0) align = 1; - } else if (align & (align - 1)) { + else if (align & (align - 1)) return NULL; - } - if (size == 0) { + if (size == 0) return NULL; - } else if (size < 4) { + else if (size < 4) size = 4; - } if ((size % align) != 0) size = ALIGN(size, align); allocation = max_t(size_t, size, PAGE_SIZE); - if (!boundary) { + if (!boundary) boundary = allocation; - } else if ((boundary < size) || (boundary & (boundary - 1))) { + else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - } retval = 
kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); if (!retval) @@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, INIT_LIST_HEAD(&retval->pools); + /* + * pools_lock ensures that the ->dma_pools list does not get corrupted. + * pools_reg_lock ensures that there is not a race between + * dma_pool_create() and dma_pool_destroy() or within dma_pool_create() + * when the first invocation of dma_pool_create() failed on + * device_create_file() and the second assumes that it has been done (I + * know it is a short window). + */ + mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools) && - device_create_file(dev, &dev_attr_pools)) { - kfree(retval); - return NULL; - } else - list_add(&retval->pools, &dev->dma_pools); + if (list_empty(&dev->dma_pools)) + empty = true; + list_add(&retval->pools, &dev->dma_pools); mutex_unlock(&pools_lock); - + if (empty) { + int err; + + err = device_create_file(dev, &dev_attr_pools); + if (err) { + mutex_lock(&pools_lock); + list_del(&retval->pools); + mutex_unlock(&pools_lock); + mutex_unlock(&pools_reg_lock); + kfree(retval); + return NULL; + } + } + mutex_unlock(&pools_reg_lock); return retval; } EXPORT_SYMBOL(dma_pool_create); @@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) */ void dma_pool_destroy(struct dma_pool *pool) { + bool empty = false; + + mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); if (pool->dev && list_empty(&pool->dev->dma_pools)) - device_remove_file(pool->dev, &dev_attr_pools); + empty = true; mutex_unlock(&pools_lock); + if (empty) + device_remove_file(pool->dev, &dev_attr_pools); + mutex_unlock(&pools_reg_lock); while (!list_empty(&pool->page_list)) { struct dma_page *page; diff --git a/mm/filemap.c b/mm/filemap.c index 90effcdf948d..14b4642279f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc); * at a 
cost of "thundering herd" phenomena during rare hash * collisions. */ -static wait_queue_head_t *page_waitqueue(struct page *page) +wait_queue_head_t *page_waitqueue(struct page *page) { const struct zone *zone = page_zone(page); return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } - -static inline void wake_up_page(struct page *page, int bit) -{ - __wake_up_bit(page_waitqueue(page), &page->flags, bit); -} +EXPORT_SYMBOL(page_waitqueue); void wait_on_page_bit(struct page *page, int bit_nr) { @@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) bit_wait_io, TASK_KILLABLE); } +int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + wait.key.timeout = jiffies + timeout; + if (!test_bit(bit_nr, &page->flags)) + return 0; + return __wait_on_bit(page_waitqueue(page), &wait, + bit_wait_io_timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); + /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest @@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). * Also wakes sleepers in wait_on_page_writeback() because the wakeup - * mechananism between PageLocked pages and PageWriteback pages is shared. + * mechanism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 
* * The mb is necessary to enforce ordering between the clear_bit and the read @@ -1744,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter); static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; - struct page *page; + struct page *page; int ret; do { @@ -1761,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) page_cache_release(page); } while (ret == AOP_TRUNCATED_PAGE); - + return ret; } @@ -10,6 +10,10 @@ #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <asm/pgtable.h> + #include "internal.h" static struct page *no_page_table(struct vm_area_struct *vma, @@ -281,6 +285,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + if (*flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { @@ -672,3 +680,353 @@ struct page *get_dump_page(unsigned long addr) return page; } #endif /* CONFIG_ELF_CORE */ + +/* + * Generic RCU Fast GUP + * + * get_user_pages_fast attempts to pin user pages by walking the page + * tables directly and avoids taking locks. Thus the walker needs to be + * protected from page table pages being freed from under it, and should + * block any THP splits. + * + * One way to achieve this is to have the walker disable interrupts, and + * rely on IPIs from the TLB flushing code blocking before the page table + * pages are freed. This is unsuitable for architectures that do not need + * to broadcast an IPI when invalidating TLBs. + * + * Another way to achieve this is to batch up page table containing pages + * belonging to more than one mm_user, then rcu_sched a callback to free those + * pages. 
Disabling interrupts will allow the fast_gup walker to both block + * the rcu_sched callback, and an IPI that we broadcast for splitting THPs + * (which is a relatively rare event). The code below adopts this strategy. + * + * Before activating this code, please be aware that the following assumptions + * are currently made: + * + * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free + * pages containing page tables. + * + * *) THP splits will broadcast an IPI, this can be achieved by overriding + * pmdp_splitting_flush. + * + * *) ptes can be read atomically by the architecture. + * + * *) access_ok is sufficient to validate userspace address ranges. + * + * The last two assumptions can be relaxed by the addition of helper functions. + * + * This code is based heavily on the PowerPC implementation by Nick Piggin. + */ +#ifdef CONFIG_HAVE_GENERIC_RCU_GUP + +#ifdef __HAVE_ARCH_PTE_SPECIAL +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + pte_t *ptep, *ptem; + int ret = 0; + + ptem = ptep = pte_offset_map(&pmd, addr); + do { + /* + * In the line below we are assuming that the pte can be read + * atomically. If this is not the case for your architecture, + * please wrap this in a helper function! 
+ * + * for an example see gup_get_pte in arch/x86/mm/gup.c + */ + pte_t pte = ACCESS_ONCE(*ptep); + struct page *page; + + /* + * Similar to the PMD case below, NUMA hinting must take slow + * path + */ + if (!pte_present(pte) || pte_special(pte) || + pte_numa(pte) || (write && !pte_write(pte))) + goto pte_unmap; + + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + + if (!page_cache_get_speculative(page)) + goto pte_unmap; + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + put_page(page); + goto pte_unmap; + } + + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + + ret = 1; + +pte_unmap: + pte_unmap(ptem); + return ret; +} +#else + +/* + * If we can't determine whether or not a pte is special, then fail immediately + * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not + * to be special. + * + * For a futex to be placed on a THP tail page, get_futex_key requires a + * __get_user_pages_fast implementation that can pin pages. Thus it's still + * useful to have gup_huge_pmd even if we can't operate on ptes. 
+ */ +static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + return 0; +} +#endif /* __HAVE_ARCH_PTE_SPECIAL */ + +static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page, *tail; + int refs; + + if (write && !pmd_write(orig)) + return 0; + + refs = 0; + head = pmd_page(orig); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail pages need their mapcount reference taken before we + * return. (This allows the THP code to bump their ref count when + * they are split into base pages). 
+ */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page, *tail; + int refs; + + if (write && !pud_write(orig)) + return 0; + + refs = 0; + head = pud_page(orig); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pud_val(orig) != pud_val(*pudp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = ACCESS_ONCE(*pmdp); + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return 0; + + if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. 
+ */ + if (pmd_numa(pmd)) + return 0; + + if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + pages, nr)) + return 0; + + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(pgdp, addr); + do { + pud_t pud = ACCESS_ONCE(*pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (pud_huge(pud)) { + if (!gup_huge_pud(pud, pudp, addr, next, write, + pages, nr)) + return 0; + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. It will only return non-negative values. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next, flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + return 0; + + /* + * Disable interrupts. We use the nested form as we can already have + * interrupts disabled by get_futex_key. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h + * for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. 
+ */ + + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgdp)) + break; + else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + int nr, ret; + + start &= PAGE_MASK; + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + + if (nr < nr_pages) { + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + nr_pages - nr, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + } + + return ret; +} + +#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d9a21d06b862..74c78aa8bc2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long mmun_end; /* For mmu_notifiers */ ptl = pmd_lockptr(mm, pmd); - VM_BUG_ON(!vma->anon_vma); + 
VM_BUG_ON_VMA(!vma->anon_vma, vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; @@ -1795,14 +1795,17 @@ static int __split_huge_page_map(struct page *page, for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); + /* + * Note that pmd_numa is not transferred deliberately + * to avoid any possibility that pte_numa leaks to + * a PROT_NONE VMA by accident. + */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); if (!pmd_young(*pmd)) entry = pte_mkold(entry); - if (pmd_numa(*pmd)) - entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -2045,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON(khugepaged_test_exit(mm)); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; @@ -2080,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); + VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -2319,23 +2322,17 @@ static struct page int node) { VM_BUG_ON_PAGE(*hpage, *hpage); + /* - * Allocate the page while the vma is still valid and under - * the mmap_sem read mode so there is no memory allocation - * later when we take the mmap_sem in write mode. This is more - * friendly behavior (OTOH it may actually hide bugs) to - * filesystems in userland with daemons allocating memory in - * the userland I/O paths. 
Allocating memory with the - * mmap_sem in read mode is good idea also to allow greater - * scalability. + * Before allocating the hugepage, release the mmap_sem read lock. + * The allocation can take potentially a long time if it involves + * sync compaction, and we do not need to hold the mmap_sem during + * that. We will recheck the vma after taking it again in write mode. */ + up_read(&mm->mmap_sem); + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); - /* - * After allocating the hugepage, release the mmap_sem read lock in - * preparation for taking it in write mode. - */ - up_read(&mm->mmap_sem); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2409,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); + VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); return true; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eeceeeb09019..9fd722769927 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -458,15 +458,15 @@ static void 
set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } @@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { - VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } diff --git a/mm/internal.h b/mm/internal.h index a1b651b11c5f..829304090b90 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -142,10 +142,10 @@ struct compact_control { bool finished_update_migrate; int order; /* order a direct compactor needs */ - int migratetype; /* MOVABLE, RECLAIMABLE etc */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ struct zone *zone; - bool contended; /* True if a lock was contended, or - * need_resched() true during async + int contended; /* Signal need_resched() or lock + * contention detected during * compaction */ }; @@ -154,8 +154,8 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); unsigned long -isolate_migratepages_range(struct zone *zone, struct compact_control *cc, - unsigned long low_pfn, unsigned long end_pfn, bool unevictable); +isolate_migratepages_range(struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); #endif @@ -164,7 +164,8 @@ 
isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the - * page cannot be allocated or merged in parallel. + * page cannot be allocated or merged in parallel. Alternatively, it must + * handle invalid values gracefully, and use page_order_unsafe() below. */ static inline unsigned long page_order(struct page *page) { @@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +/* + * Like page_order(), but for callers who cannot afford to hold the zone lock. + * PageBuddy() should be checked first by the caller to minimize race window, + * and invalid values must be handled gracefully. + * + * ACCESS_ONCE is used so that if the caller assigns the result into a local + * variable and e.g. tests it for valid range before using, the compiler cannot + * decide to remove the variable and inline the page_private(page) multiple + * times, potentially observing different values in the tests and the actual + * use of the result. 
+ */ +#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) + static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4a5822a586e6..8da581fa9060 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *parent; unsigned long last = vma_last_pgoff(node); - VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); + VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); if (!prev->shared.linear.rb.rb_right) { parent = prev; diff --git a/mm/iov_iter.c b/mm/iov_iter.c index ab88dc0ea1d3..eafcf60f6b83 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -4,6 +4,96 @@ #include <linux/slab.h> #include <linux/vmalloc.h> +static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if 
(unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -166,6 +256,50 @@ done: return wanted - bytes; } +static size_t zero_iovec(size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + left = __clear_user(buf, copy); + copy -= left; + skip += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __clear_user(buf, copy); + copy -= left; + skip = copy; + bytes -= copy; + } + + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { @@ -310,7 +444,7 @@ void iov_iter_init(struct iov_iter *i, int direction, EXPORT_SYMBOL(iov_iter_init); static ssize_t get_pages_iovec(struct iov_iter *i, - struct page **pages, unsigned maxpages, + struct page **pages, 
size_t maxsize, unsigned maxpages, size_t *start) { size_t offset = i->iov_offset; @@ -323,6 +457,8 @@ static ssize_t get_pages_iovec(struct iov_iter *i, len = iov->iov_len - offset; if (len > i->count) len = i->count; + if (len > maxsize) + len = maxsize; addr = (unsigned long)iov->iov_base + offset; len += *start = addr & (PAGE_SIZE - 1); if (len > maxpages * PAGE_SIZE) @@ -412,12 +548,17 @@ static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t kunmap_atomic(to); } -static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) +static void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + +static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i) { size_t skip, copy, wanted; const struct bio_vec *bvec; - void *kaddr, *from; if (unlikely(bytes > i->count)) bytes = i->count; @@ -430,8 +571,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by skip = i->iov_offset; copy = min_t(size_t, bytes, bvec->bv_len - skip); - kaddr = kmap_atomic(page); - from = kaddr + offset; memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); skip += copy; from += copy; @@ -444,7 +583,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by from += copy; bytes -= copy; } - kunmap_atomic(kaddr); if (skip == bvec->bv_len) { bvec++; skip = 0; @@ -456,12 +594,10 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by return wanted - bytes; } -static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) +static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i) { size_t skip, copy, wanted; const struct bio_vec *bvec; - void *kaddr, *to; if (unlikely(bytes > i->count)) bytes = i->count; @@ -473,10 +609,6 @@ static size_t 
copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bvec = i->bvec; skip = i->iov_offset; - kaddr = kmap_atomic(page); - - to = kaddr + offset; - copy = min(bytes, bvec->bv_len - skip); memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); @@ -493,7 +625,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t to += copy; bytes -= copy; } - kunmap_atomic(kaddr); if (skip == bvec->bv_len) { bvec++; skip = 0; @@ -505,6 +636,61 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t return wanted; } +static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, + size_t bytes, struct iov_iter *i) +{ + void *kaddr = kmap_atomic(page); + size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; +} + +static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, + size_t bytes, struct iov_iter *i) +{ + void *kaddr = kmap_atomic(page); + size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; +} + +static size_t zero_bvec(size_t bytes, struct iov_iter *i) +{ + size_t skip, copy, wanted; + const struct bio_vec *bvec; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + bvec = i->bvec; + skip = i->iov_offset; + copy = min_t(size_t, bytes, bvec->bv_len - skip); + + memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy); + skip += copy; + bytes -= copy; + while (bytes) { + bvec++; + copy = min(bytes, (size_t)bvec->bv_len); + memzero_page(bvec->bv_page, bvec->bv_offset, copy); + skip = copy; + bytes -= copy; + } + if (skip == bvec->bv_len) { + bvec++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= bvec - i->bvec; + i->bvec = bvec; + i->iov_offset = skip; + return wanted - bytes; +} + static size_t copy_from_user_bvec(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { @@ -588,13 
+774,15 @@ static unsigned long alignment_bvec(const struct iov_iter *i) } static ssize_t get_pages_bvec(struct iov_iter *i, - struct page **pages, unsigned maxpages, + struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { const struct bio_vec *bvec = i->bvec; size_t len = bvec->bv_len - i->iov_offset; if (len > i->count) len = i->count; + if (len > maxsize) + len = maxsize; /* can't be more than PAGE_SIZE */ *start = bvec->bv_offset + i->iov_offset; @@ -668,6 +856,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_to_iter_bvec(addr, bytes, i); + else + return copy_to_iter_iovec(addr, bytes, i); +} +EXPORT_SYMBOL(copy_to_iter); + +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + if (i->type & ITER_BVEC) + return copy_from_iter_bvec(addr, bytes, i); + else + return copy_from_iter_iovec(addr, bytes, i); +} +EXPORT_SYMBOL(copy_from_iter); + +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +{ + if (i->type & ITER_BVEC) { + return zero_bvec(bytes, i); + } else { + return zero_iovec(bytes, i); + } +} +EXPORT_SYMBOL(iov_iter_zero); + size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { @@ -711,13 +927,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) EXPORT_SYMBOL(iov_iter_alignment); ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, unsigned maxpages, + struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (i->type & ITER_BVEC) - return get_pages_bvec(i, pages, maxpages, start); + return get_pages_bvec(i, pages, maxsize, maxpages, start); else - return get_pages_iovec(i, pages, maxpages, start); + return get_pages_iovec(i, pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages); diff --git a/mm/kmemcheck.c 
b/mm/kmemcheck.c index fd814fd61319..cab58bb592d8 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -2,6 +2,7 @@ #include <linux/mm_types.h> #include <linux/mm.h> #include <linux/slab.h> +#include "slab.h" #include <linux/kmemcheck.h> void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) @@ -2310,7 +2310,7 @@ static int __init ksm_init(void) ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { - printk(KERN_ERR "ksm: creating kthread failed\n"); + pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } @@ -2318,7 +2318,7 @@ static int __init ksm_init(void) #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { - printk(KERN_ERR "ksm: register sysfs failed\n"); + pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } diff --git a/mm/memblock.c b/mm/memblock.c index 70fad0c0dafb..6ecb0d937fb5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -816,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, if (nid != NUMA_NO_NODE && nid != m_nid) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 085dc6d2f876..23976fd885fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,6 +292,9 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; + /* css_online() has been completed */ + int initialized; + /* * the counter to account for mem+swap usage. 
*/ @@ -315,9 +318,6 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; - /* set when res.limit == memsw.limit */ - bool memsw_is_minimum; - /* protect arrays of thresholds */ struct mutex thresholds_lock; @@ -481,14 +481,6 @@ enum res_type { #define OOM_CONTROL (0) /* - * Reclaim flags for mem_cgroup_hierarchical_reclaim - */ -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) - -/* * The memcg_create_mutex will be held whenever a new cgroup is created. * As a consequence, any change that needs to protect against new child cgroups * appearing has to hold it as well. @@ -646,11 +638,13 @@ int memcg_limited_groups_array_size; struct static_key memcg_kmem_enabled_key; EXPORT_SYMBOL(memcg_kmem_enabled_key); +static void memcg_free_cache_id(int id); + static void disarm_kmem_keys(struct mem_cgroup *memcg) { if (memcg_kmem_is_active(memcg)) { static_key_slow_dec(&memcg_kmem_enabled_key); - ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + memcg_free_cache_id(memcg->kmemcg_id); } /* * This check can't live in kmem destruction function, @@ -1099,10 +1093,21 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && - css_tryget_online(next_css))) - return mem_cgroup_from_css(next_css); + struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); + + if (next_css == &root->css) + return memcg; + + if (css_tryget_online(next_css)) { + /* + * Make sure the memcg is initialized: + * mem_cgroup_css_online() orders the + * initialization against setting the flag. 
+ */ + if (smp_load_acquire(&memcg->initialized)) + return memcg; + css_put(next_css); + } prev_css = next_css; goto skip_node; @@ -1792,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, NULL, "Memory cgroup out of memory"); } -static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned long flags) -{ - unsigned long total = 0; - bool noswap = false; - int loop; - - if (flags & MEM_CGROUP_RECLAIM_NOSWAP) - noswap = true; - if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) - noswap = true; - - for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { - if (loop) - drain_all_stock_async(memcg); - total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); - /* - * Allow limit shrinkers, which are triggered directly - * by userspace, to catch signals and stop reclaim - * after minimal progress, regardless of the margin. - */ - if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) - break; - if (mem_cgroup_margin(memcg)) - break; - /* - * If nothing was reclaimed after two attempts, there - * may be no reclaimable pages in this hierarchy. 
- */ - if (loop && !total) - break; - } - return total; -} - /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -2530,8 +2499,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long nr_reclaimed; - unsigned long flags = 0; unsigned long long size; + bool may_swap = true; + bool drained = false; int ret = 0; if (mem_cgroup_is_root(memcg)) @@ -2541,16 +2511,17 @@ retry: goto done; size = batch * PAGE_SIZE; - if (!res_counter_charge(&memcg->res, size, &fail_res)) { - if (!do_swap_account) - goto done_restock; - if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + if (!do_swap_account || + !res_counter_charge(&memcg->memsw, size, &fail_res)) { + if (!res_counter_charge(&memcg->res, size, &fail_res)) goto done_restock; - res_counter_uncharge(&memcg->res, size); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - } else + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + } else { + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + may_swap = false; + } if (batch > nr_pages) { batch = nr_pages; @@ -2574,11 +2545,18 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; - nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, + gfp_mask, may_swap); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) goto retry; + if (!drained) { + drain_all_stock_async(mem_over_limit); + drained = true; + goto retry; + } + if (gfp_mask & __GFP_NORETRY) goto nomem; /* @@ -2784,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex); static DEFINE_MUTEX(activate_kmem_mutex); -static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) -{ - return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && - memcg_kmem_is_active(memcg); -} - /* * This is 
a bit cumbersome, but it is rarely used and avoids a backpointer * in the memcg_cache_params struct. @@ -2809,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct memcg_cache_params *params; - if (!memcg_can_account_kmem(memcg)) + if (!memcg_kmem_is_active(memcg)) return -EIO; print_slabinfo_header(m); @@ -2892,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -static size_t memcg_caches_array_size(int num_groups) +static int memcg_alloc_cache_id(void) { - ssize_t size; - if (num_groups <= 0) - return 0; + int id, size; + int err; + + id = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (id < 0) + return id; - size = 2 * num_groups; + if (id < memcg_limited_groups_array_size) + return id; + + /* + * There's no space for the new id in memcg_caches arrays, + * so we have to grow them. + */ + + size = 2 * (id + 1); if (size < MEMCG_CACHES_MIN_SIZE) size = MEMCG_CACHES_MIN_SIZE; else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - return size; + mutex_lock(&memcg_slab_mutex); + err = memcg_update_all_caches(size); + mutex_unlock(&memcg_slab_mutex); + + if (err) { + ida_simple_remove(&kmem_limited_groups, id); + return err; + } + return id; +} + +static void memcg_free_cache_id(int id) +{ + ida_simple_remove(&kmem_limited_groups, id); } /* @@ -2914,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups) */ void memcg_update_array_size(int num) { - if (num > memcg_limited_groups_array_size) - memcg_limited_groups_array_size = memcg_caches_array_size(num); -} - -int memcg_update_cache_size(struct kmem_cache *s, int num_groups) -{ - struct memcg_cache_params *cur_params = s->memcg_params; - - VM_BUG_ON(!is_root_cache(s)); - - if (num_groups > memcg_limited_groups_array_size) { - int i; - struct memcg_cache_params *new_params; - ssize_t size = 
memcg_caches_array_size(num_groups); - - size *= sizeof(void *); - size += offsetof(struct memcg_cache_params, memcg_caches); - - new_params = kzalloc(size, GFP_KERNEL); - if (!new_params) - return -ENOMEM; - - new_params->is_root_cache = true; - - /* - * There is the chance it will be bigger than - * memcg_limited_groups_array_size, if we failed an allocation - * in a cache, in which case all caches updated before it, will - * have a bigger array. - * - * But if that is the case, the data after - * memcg_limited_groups_array_size is certainly unused - */ - for (i = 0; i < memcg_limited_groups_array_size; i++) { - if (!cur_params->memcg_caches[i]) - continue; - new_params->memcg_caches[i] = - cur_params->memcg_caches[i]; - } - - /* - * Ideally, we would wait until all caches succeed, and only - * then free the old one. But this is not worth the extra - * pointer per-cache we'd have to have for this. - * - * It is not a big deal if some caches are left with a size - * bigger than the others. And all updates will reset this - * anyway. 
- */ - rcu_assign_pointer(s->memcg_params, new_params); - if (cur_params) - kfree_rcu(cur_params, rcu_head); - } - return 0; -} - -int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, - struct kmem_cache *root_cache) -{ - size_t size; - - if (!memcg_kmem_enabled()) - return 0; - - if (!memcg) { - size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_limited_groups_array_size * sizeof(void *); - } else - size = sizeof(struct memcg_cache_params); - - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) - return -ENOMEM; - - if (memcg) { - s->memcg_params->memcg = memcg; - s->memcg_params->root_cache = root_cache; - css_get(&memcg->css); - } else - s->memcg_params->is_root_cache = true; - - return 0; -} - -void memcg_free_cache_params(struct kmem_cache *s) -{ - if (!s->memcg_params) - return; - if (!s->memcg_params->is_root_cache) - css_put(&s->memcg_params->memcg->css); - kfree(s->memcg_params); + memcg_limited_groups_array_size = num; } static void memcg_register_cache(struct mem_cgroup *memcg, @@ -3037,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (!cachep) return; + css_get(&memcg->css); list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* @@ -3070,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) list_del(&cachep->memcg_params->list); kmem_cache_destroy(cachep); + + /* drop the reference taken in memcg_register_cache */ + css_put(&memcg->css); } /* @@ -3247,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - if (!memcg_can_account_kmem(memcg)) + if (!memcg_kmem_is_active(memcg)) goto out; memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); @@ -3332,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) memcg = get_mem_cgroup_from_mm(current->mm); - if 
(!memcg_can_account_kmem(memcg)) { + if (!memcg_kmem_is_active(memcg)) { css_put(&memcg->css); return true; } @@ -3674,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memswlimit, memlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); u64 curusage, oldusage; @@ -3701,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, * We have to guarantee memcg->res.limit <= memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); - memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - if (memswlimit < val) { + if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { ret = -EINVAL; mutex_unlock(&set_limit_mutex); break; } - memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); - if (memlimit < val) + if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) enlarge = 1; ret = res_counter_set_limit(&memcg->res, val); - if (!ret) { - if (memswlimit == val) - memcg->memsw_is_minimum = true; - else - memcg->memsw_is_minimum = false; - } mutex_unlock(&set_limit_mutex); if (!ret) break; - mem_cgroup_reclaim(memcg, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); + curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3743,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memlimit, memswlimit, oldusage, curusage; + u64 oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; int enlarge = 0; @@ -3762,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
*/ mutex_lock(&set_limit_mutex); - memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); - if (memlimit > val) { + if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { ret = -EINVAL; mutex_unlock(&set_limit_mutex); break; } - memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - if (memswlimit < val) + if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) enlarge = 1; ret = res_counter_set_limit(&memcg->memsw, val); - if (!ret) { - if (memlimit == val) - memcg->memsw_is_minimum = true; - else - memcg->memsw_is_minimum = false; - } mutex_unlock(&set_limit_mutex); if (!ret) break; - mem_cgroup_reclaim(memcg, GFP_KERNEL, - MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); + curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -4034,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) if (signal_pending(current)) return -EINTR; - progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, - false); + progress = try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, true); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4200,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, if (err) goto out; - memcg_id = ida_simple_get(&kmem_limited_groups, - 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) { err = memcg_id; goto out; } - /* - * Make sure we have enough space for this cgroup in each root cache's - * memcg_params. 
- */ - mutex_lock(&memcg_slab_mutex); - err = memcg_update_all_caches(memcg_id + 1); - mutex_unlock(&memcg_slab_mutex); - if (err) - goto out_rmid; - memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); @@ -4237,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, out: memcg_resume_kmem_account(); return err; - -out_rmid: - ida_simple_remove(&kmem_limited_groups, memcg_id); - goto out; } static int memcg_activate_kmem(struct mem_cgroup *memcg, @@ -5549,6 +5427,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); + int ret; if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; @@ -5585,7 +5464,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &memory_cgrp_subsys); + ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); + if (ret) + return ret; + + /* + * Make sure the memcg is initialized: mem_cgroup_iter() + * orders reading memcg->initialized against its callers + * reading the memcg members. 
+ */ + smp_store_release(&memcg->initialized, 1); + + return 0; } /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44c6bd201d3a..8639f6b28746 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -148,7 +148,7 @@ static int hwpoison_filter_task(struct page *p) ino = cgroup_ino(css->cgroup); css_put(css); - if (!ino || ino != hwpoison_filter_memcg) + if (ino != hwpoison_filter_memcg) return -EINVAL; return 0; diff --git a/mm/memory.c b/mm/memory.c index adeac306610f..1cc6bfbd872e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; unsigned long highest_memmap_pfn __read_mostly; +EXPORT_SYMBOL(zero_pfn); + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -1125,7 +1127,7 @@ again: addr) != page->index) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(ptent)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, addr, pte, ptfile); } if (PageAnon(page)) @@ -2051,7 +2053,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { /* - * VM_MIXEDMAP !pfn_valid() case + * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a + * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable as we can't do any dirty diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2ff8c2325e96..29d8693d0c61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* * Confirm all pages in a range [start, end) is belongs to the same zone. 
*/ -static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct zone *zone = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8f5330d74f47..e58725aff7e9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,25 +123,23 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; -static struct mempolicy *get_task_policy(struct task_struct *p) +struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; + int node; - if (!pol) { - int node = numa_node_id(); + if (pol) + return pol; - if (node != NUMA_NO_NODE) { - pol = &preferred_node_policy[node]; - /* - * preferred_node_policy is not initialised early in - * boot - */ - if (!pol->mode) - pol = NULL; - } + node = numa_node_id(); + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* preferred_node_policy is not initialised early in boot */ + if (pol->mode) + return pol; } - return pol; + return &default_policy; } static const struct mempolicy_operations { @@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, } if (flags & MPOL_MF_LAZY) { - change_prot_numa(vma, start, endvma); + /* Similar to task_numa_work, skip inaccessible VMAs */ + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + change_prot_numa(vma, start, endvma); goto next; } @@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; - struct mm_struct *mm = current->mm; NODEMASK_SCRATCH(scratch); int ret; @@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, ret = PTR_ERR(new); goto out; } - /* - * prevent changing our mempolicy while show_numa_maps() - * is using it. - * Note: do_set_mempolicy() can be called at init time - * with no 'mm'. 
- */ - if (mm) - down_write(&mm->mmap_sem); + task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); - if (mm) - up_write(&mm->mmap_sem); mpol_put(new); goto out; } @@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); task_unlock(current); - if (mm) - up_write(&mm->mmap_sem); - mpol_put(old); ret = 0; out: @@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, #endif -/* - * get_vma_policy(@task, @vma, @addr) - * @task: task for fallback if vma policy == default - * @vma: virtual memory area whose policy is sought - * @addr: address in @vma for shared policy lookup - * - * Returns effective policy for a VMA at specified address. - * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies must be - * protected by task_lock(task) by the caller. - * Shared policies [those marked as MPOL_F_SHARED] require an extra reference - * count--added by the get_policy() vm_op, as appropriate--to protect against - * freeing by another task. It is the caller's responsibility to free the - * extra reference for shared policies. 
- */ -struct mempolicy *get_vma_policy(struct task_struct *task, - struct vm_area_struct *vma, unsigned long addr) +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) { - struct mempolicy *pol = get_task_policy(task); + struct mempolicy *pol = NULL; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { - struct mempolicy *vpol = vma->vm_ops->get_policy(vma, - addr); - if (vpol) - pol = vpol; + pol = vma->vm_ops->get_policy(vma, addr); } else if (vma->vm_policy) { pol = vma->vm_policy; @@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task, mpol_get(pol); } } + + return pol; +} + +/* + * get_vma_policy(@vma, @addr) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to current->mempolicy or system default policy, as necessary. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. 
+ */ +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = __get_vma_policy(vma, addr); + if (!pol) - pol = &default_policy; + pol = get_task_policy(current); + return pol; } -bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +bool vma_policy_mof(struct vm_area_struct *vma) { - struct mempolicy *pol = get_task_policy(task); - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - bool ret = false; + struct mempolicy *pol; - pol = vma->vm_ops->get_policy(vma, vma->vm_start); - if (pol && (pol->flags & MPOL_F_MOF)) - ret = true; - mpol_cond_put(pol); + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; - return ret; - } else if (vma->vm_policy) { - pol = vma->vm_policy; - } + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; } + pol = vma->vm_policy; if (!pol) - return default_policy.flags & MPOL_F_MOF; + pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } @@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, { struct zonelist *zl; - *mpol = get_vma_policy(current, vma, addr); + *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { @@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned int cpuset_mems_cookie; retry_cpuset: - pol = get_vma_policy(current, vma, addr); + pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { @@ -2046,8 +2035,7 @@ retry_cpuset: page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, node), policy_nodemask(gfp, pol)); - if (unlikely(mpol_needs_cond_ref(pol))) - __mpol_put(pol); + mpol_cond_put(pol); if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto 
retry_cpuset; return page; @@ -2074,12 +2062,12 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = get_task_policy(current); + struct mempolicy *pol = &default_policy; struct page *page; unsigned int cpuset_mems_cookie; - if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) - pol = &default_policy; + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long BUG_ON(!vma); - pol = get_vma_policy(current, vma, addr); + pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; diff --git a/mm/migrate.c b/mm/migrate.c index f78ec9bd454d..01439953abf5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -146,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (pte_swp_soft_dirty(*ptep)) pte = pte_mksoft_dirty(pte); + + /* Recheck VMA as permissions can change since migration started */ if (is_write_migration_entry(entry)) - pte = pte_mkwrite(pte); + pte = maybe_mkwrite(pte, vma); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -873,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } - if (unlikely(balloon_page_movable(page))) { + if (unlikely(isolated_balloon_page(page))) { /* * A ballooned page does not need any special attention from * physical to virtual reverse mapping procedures. @@ -952,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, rc = __unmap_and_move(page, newpage, force, mode); - if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { - /* - * A ballooned page has been migrated already. - * Now, it's the time to wrap-up counters, - * handle the page back to Buddy and return. 
- */ - dec_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - balloon_page_free(page); - return MIGRATEPAGE_SUCCESS; - } out: if (rc != -EAGAIN) { /* @@ -985,6 +977,9 @@ out: if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { ClearPageSwapBacked(newpage); put_new_page(newpage, private); + } else if (unlikely(__is_movable_balloon_page(newpage))) { + /* drop our reference, page already in the balloon */ + put_page(newpage); } else putback_lru_page(newpage); diff --git a/mm/mlock.c b/mm/mlock.c index ce84cb0b83ef..73cf0987088c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON(start < vma->vm_start); - VM_BUG_ON(end > vma->vm_end); - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* @@ -789,7 +789,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - cond_resched(); + cond_resched_rcu_qs(); } out: return 0; diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4a0b99..7f855206e7fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm, * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (yes) yes w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * + * * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes @@ -89,6 +89,25 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) } EXPORT_SYMBOL(vm_get_page_prot); +static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} + +/* Update vma->vm_page_prot to 
reflect vma->vm_flags. */ +void vma_set_page_prot(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma)) { + vm_flags &= ~VM_SHARED; + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, + vm_flags); + } +} + + int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ unsigned long sysctl_overcommit_kbytes __read_mostly; @@ -268,7 +287,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len); SYSCALL_DEFINE1(brk, unsigned long, brk) { - unsigned long rlim, retval; + unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; unsigned long min_brk; @@ -298,9 +317,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ - rlim = rlimit(RLIMIT_DATA); - if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, + mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); @@ -369,20 +387,22 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_emerg("vm_start %lx < prev %lx\n", + vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_emerg("vm_start %lx < pend %lx\n", + vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - pr_info("vm_end %lx < vm_start %lx\n", - vma->vm_end, vma->vm_start); + pr_emerg("vm_start %lx > vm_end %lx\n", + vma->vm_start, vma->vm_end); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_info("free gap %lx, correct 
%lx\n", + pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -396,7 +416,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - pr_info("backwards %d, forwards %d\n", j, i); + pr_emerg("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? -1 : i; @@ -409,8 +429,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - BUG_ON(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); + VM_BUG_ON_VMA(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma), + vma); } } @@ -420,8 +441,10 @@ static void validate_mm(struct mm_struct *mm) int i = 0; unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; + while (vma) { struct anon_vma_chain *avc; + vma_lock_anon_vma(vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); @@ -431,20 +454,21 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_info("map_count %d vm_next %d\n", mm->map_count, i); + pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - pr_info("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); + pr_emerg("mm->highest_vm_end %lx, found %lx\n", + mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - pr_info("map_count %d rb %d\n", mm->map_count, i); + if (i != -1) + pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } - BUG_ON(bug); + VM_BUG_ON_MM(bug, mm); } #else #define validate_mm_rb(root, ignore) do { } while (0) @@ -741,7 +765,7 @@ again: remove_next = 1 + (end > next->vm_end); * split_vma inserting another: so it must be * mprotect case 4 shifting the boundary down. 
*/ - adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); + adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); exporter = vma; importer = next; } @@ -787,8 +811,8 @@ again: remove_next = 1 + (end > next->vm_end); if (!anon_vma && adjust_next) anon_vma = next->anon_vma; if (anon_vma) { - VM_BUG_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); + VM_BUG_ON_VMA(adjust_next && next->anon_vma && + anon_vma != next->anon_vma, next); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) @@ -1010,7 +1034,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, + struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; @@ -1036,7 +1060,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, * Can it merge with the predecessor? */ if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && + mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff)) { /* @@ -1064,7 +1088,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, * Can this new request be merged in front of next? 
*/ if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && + mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen)) { if (prev && addr < prev->vm_end) /* case 4 */ @@ -1235,7 +1259,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flags, unsigned long pgoff, unsigned long *populate) { - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; vm_flags_t vm_flags; *populate = 0; @@ -1263,7 +1287,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) - return -EOVERFLOW; + return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) @@ -1470,11 +1494,16 @@ int vma_wants_writenotify(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->page_mkwrite) return 1; - /* The open routine did something to the protections already? */ + /* The open routine did something to the protections that pgprot_modify + * won't preserve? */ if (pgprot_val(vma->vm_page_prot) != - pgprot_val(vm_get_page_prot(vm_flags))) + pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) return 0; + /* Do we need to track softdirty? */ + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + return 1; + /* Specialty mapping? */ if (vm_flags & VM_PFNMAP) return 0; @@ -1610,21 +1639,6 @@ munmap_back: goto free_vma; } - if (vma_wants_writenotify(vma)) { - pgprot_t pprot = vma->vm_page_prot; - - /* Can vma->vm_page_prot have changed?? - * - * Answer: Yes, drivers may have changed it in their - * f_op->mmap method. - * - * Ensures that vmas marked as uncached stay that way. 
- */ - vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - } - vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ if (file) { @@ -1658,6 +1672,8 @@ out: */ vma->vm_flags |= VM_SOFTDIRTY; + vma_set_page_prot(vma); + return addr; unmap_and_free_vma: @@ -1921,7 +1937,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; return vm_unmapped_area(&info); } -#endif +#endif /* * This mmap-allocator allocates new areas top-down from below the @@ -2321,13 +2337,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) } struct vm_area_struct * -find_extend_vma(struct mm_struct * mm, unsigned long addr) +find_extend_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct * vma; + struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; - vma = find_vma(mm,addr); + vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) @@ -2376,7 +2392,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; + struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; struct mmu_gather tlb; lru_add_drain(); @@ -2423,7 +2439,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, * __split_vma() bypasses sysctl_max_map_count checking. We use this on the * munmap path where it doesn't make sense to fail. 
*/ -static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, +static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; @@ -2512,7 +2528,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; - if ((len = PAGE_ALIGN(len)) == 0) + len = PAGE_ALIGN(len); + if (len == 0) return -EINVAL; /* Find the first overlapping VMA */ @@ -2558,7 +2575,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (error) return error; } - vma = prev? prev->vm_next: mm->mmap; + vma = prev ? prev->vm_next : mm->mmap; /* * unlock any mlock()ed ranges before detaching vmas @@ -2621,10 +2638,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) */ static unsigned long do_brk(unsigned long addr, unsigned long len) { - struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; unsigned long flags; - struct rb_node ** rb_link, * rb_parent; + struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2848,7 +2865,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ - VM_BUG_ON(faulted_in_anon_vma); + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); @@ -3196,7 +3213,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 950813b1eb36..2c8da9825fe3 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -107,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm) * existed or not. 
*/ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct mmu_notifier *mn; int young = 0, id; @@ -115,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) - young |= mn->ops->clear_flush_young(mn, mm, address); + young |= mn->ops->clear_flush_young(mn, mm, start, end); } srcu_read_unlock(&srcu, id); diff --git a/mm/mprotect.c b/mm/mprotect.c index c43d557941f8..ace93454ce8e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,13 +29,6 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> -#ifndef pgprot_modify -static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) -{ - return newprot; -} -#endif - /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -93,7 +86,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * Avoid taking write faults for pages we * know to be dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; @@ -320,13 +315,8 @@ success: * held in write mode. 
*/ vma->vm_flags = newflags; - vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, - vm_get_page_prot(newflags)); - - if (vma_wants_writenotify(vma)) { - vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); - dirty_accountable = 1; - } + dirty_accountable = vma_wants_writenotify(vma); + vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); diff --git a/mm/mremap.c b/mm/mremap.c index 05f1180e9f21..b147f66f4c40 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -21,8 +21,8 @@ #include <linux/syscalls.h> #include <linux/mmu_notifier.h> #include <linux/sched/sysctl.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_huge(*old_pmd)) { int err = 0; if (extent == HPAGE_PMD_SIZE) { - VM_BUG_ON(vma->vm_file || !vma->anon_vma); + VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, + vma); /* See comment in move_ptes() */ if (need_rmap_locks) anon_vma_lock_write(vma->anon_vma); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7ed58602e71b..7c7ab32ee503 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end; u64 i; + memblock_clear_hotplug(0, -1); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e11df8fa7ec..bbf405a3a18f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist 
*zonelist, gfp_t gfp_mask) spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - if (zone_is_oom_locked(zone)) { + if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { ret = false; goto out; } @@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. */ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - zone_set_flag(zone, ZONE_OOM_LOCKED); + set_bit(ZONE_OOM_LOCKED, &zone->flags); out: spin_unlock(&zone_scan_lock); @@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - zone_clear_flag(zone, ZONE_OOM_LOCKED); + clear_bit(ZONE_OOM_LOCKED, &zone->flags); spin_unlock(&zone_scan_lock); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef1744d..ff24c9d83112 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, } if (dirty < setpoint) { - x = min(bdi->balanced_dirty_ratelimit, - min(balanced_dirty_ratelimit, task_ratelimit)); + x = min3(bdi->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max(bdi->balanced_dirty_ratelimit, - max(balanced_dirty_ratelimit, task_ratelimit)); + x = max3(bdi->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; } @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18cee0d4c8a2..736d8e1b6381 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,8 +53,6 @@ #include 
<linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> -#include <linux/ftrace_event.h> -#include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> @@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node); */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); +int _node_numa_mem_[MAX_NUMNODES]; #endif /* @@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif for (page = start_page; page <= end_page;) { @@ -1612,9 +1611,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); - if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && - !zone_is_fair_depleted(zone)) - zone_set_flag(zone, ZONE_FAIR_DEPLETED); + if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && + !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) + set_bit(ZONE_FAIR_DEPLETED, &zone->flags); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone) mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); } while (zone++ != preferred_zone); } @@ -1985,7 +1984,7 @@ zonelist_scan: if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) break; - if (zone_is_fair_depleted(zone)) { + if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { nr_fair_skipped++; continue; } @@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, 
struct zone *preferred_zone, int classzone_idx, int migratetype, enum migrate_mode mode, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int *contended_compaction, bool *deferred_compaction) { - if (!order) - return NULL; + struct zone *last_compact_zone = NULL; + unsigned long compact_result; + struct page *page; - if (compaction_deferred(preferred_zone, order)) { - *deferred_compaction = true; + if (!order) return NULL; - } current->flags |= PF_MEMALLOC; - *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, - contended_compaction); + contended_compaction, + &last_compact_zone); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { - struct page *page; + switch (compact_result) { + case COMPACT_DEFERRED: + *deferred_compaction = true; + /* fall-through */ + case COMPACT_SKIPPED: + return NULL; + default: + break; + } - /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + /* + * At least in one zone compaction wasn't deferred or skipped, so let's + * count a compaction stall + */ + count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); - if (page) { - preferred_zone->compact_blockskip_flush = false; - compaction_defer_reset(preferred_zone, order, true); - count_vm_event(COMPACTSUCCESS); - return page; - } + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); - /* - * It's bad if compaction run occurs and fails. - * The most likely reason is that pages exist, - * but not enough to satisfy watermarks. 
- */ - count_vm_event(COMPACTFAIL); + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, classzone_idx, migratetype); - /* - * As async compaction considers a subset of pageblocks, only - * defer if the failure was a sync compaction failure. - */ - if (mode != MIGRATE_ASYNC) - defer_compaction(preferred_zone, order); + if (page) { + struct zone *zone = page_zone(page); - cond_resched(); + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; } + /* + * last_compact_zone is where try_to_compact_pages thought allocation + * should succeed, so it did not defer compaction. But here we know + * that it didn't succeed, so we do the defer. + */ + if (last_compact_zone && mode != MIGRATE_ASYNC) + defer_compaction(last_compact_zone, order); + + /* + * It's bad if compaction run occurs and fails. The most likely reason + * is that pages exist, but not enough to satisfy watermarks. 
+ */ + count_vm_event(COMPACTFAIL); + + cond_resched(); + return NULL; } #else @@ -2355,9 +2368,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, - enum migrate_mode mode, bool *contended_compaction, - bool *deferred_compaction, unsigned long *did_some_progress) + int classzone_idx, int migratetype, enum migrate_mode mode, + int *contended_compaction, bool *deferred_compaction) { return NULL; } @@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - struct zone *preferred_zone) + struct zone *preferred_zone, + nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } @@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_NO_WATERMARKS; } #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; - bool contended_compaction = false; + int contended_compaction = COMPACT_CONTENDED_NONE; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); + wake_all_kswapds(order, zonelist, 
high_zoneidx, + preferred_zone, nodemask); /* * OK, we're below the kswapd watermark and have kicked background @@ -2633,20 +2648,40 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; - /* - * If compaction is deferred for high-order allocations, it is because - * sync compaction recently failed. In this is the case and the caller - * requested a movable allocation that does not heavily disrupt the - * system then fail the allocation instead of entering direct reclaim. - */ - if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) - goto nopage; + /* Checks for THP-specific high-order allocations */ + if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. + */ + if (deferred_compaction) + goto nopage; + + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + goto nopage; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. 
+ */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + goto nopage; + } /* * It can become very expensive to allocate transparent hugepages at @@ -2726,8 +2761,7 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; } @@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zone *preferred_zone; struct zoneref *preferred_zoneref; struct page *page = NULL; - int migratetype = allocflags_to_migratetype(gfp_mask); + int migratetype = gfpflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; int classzone_idx; @@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; + retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2786,10 +2823,6 @@ retry_cpuset: goto out; classzone_idx = zonelist_zone_idx(preferred_zoneref); -#ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, @@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) zonelist->_zonerefs[pos].zone_idx = 0; } +#if defined(CONFIG_64BIT) +/* + * Devices that require DMA32/DMA are relatively rare and do not justify a + * penalty to every machine in case the specialised case applies. 
Default + * to Node-ordering on 64-bit NUMA machines + */ +static int default_zonelist_order(void) +{ + return ZONELIST_ORDER_NODE; +} +#else +/* + * On 32-bit, the Normal zone needs to be preserved for allocations accessible + * by the kernel. If processes running on node 0 deplete the low memory zone + * then reclaim will occur more frequency increasing stalls and potentially + * be easier to OOM if a large percentage of the zone is under writeback or + * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. + * Hence, default to zone ordering on 32-bit. + */ static int default_zonelist_order(void) { - int nid, zone_type; - unsigned long low_kmem_size, total_size; - struct zone *z; - int average_size; - /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the system. - * If they are really small and used heavily, the system can fall - * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and configures zone order. - */ - /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ - low_kmem_size = 0; - total_size = 0; - for_each_online_node(nid) { - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->managed_pages; - total_size += z->managed_pages; - } else if (zone_type == ZONE_NORMAL) { - /* - * If any node has only lowmem, then node order - * is preferred to allow kernel allocations - * locally; otherwise, they can easily infringe - * on other nodes when there is an abundance of - * lowmem available to allocate from. - */ - return ZONELIST_ORDER_NODE; - } - } - } - if (!low_kmem_size || /* there are no DMA area. */ - low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ - return ZONELIST_ORDER_NODE; - /* - * look into each node's config. - * If there is a node whose DMA/DMA32 memory is very big area on - * local memory, NODE_ORDER may be suitable. 
- */ - average_size = total_size / - (nodes_weight(node_states[N_MEMORY]) + 1); - for_each_online_node(nid) { - low_kmem_size = 0; - total_size = 0; - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; - } - } - if (low_kmem_size && - total_size > average_size && /* ignore small node */ - low_kmem_size > total_size * 70/100) - return ZONELIST_ORDER_NODE; - } return ZONELIST_ORDER_ZONE; } +#endif /* CONFIG_64BIT */ static void set_zonelist_order(void) { @@ -4976,6 +4971,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, + (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5701,9 +5698,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); @@ -6278,8 +6274,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end, true); + pfn = isolate_migratepages_range(cc, pfn, end); if (!pfn) { ret = -EINTR; break; @@ -6555,97 +6550,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } #endif - -static const struct trace_print_flags 
pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED - {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif -}; - -static void dump_page_flags(unsigned long flags) -{ - const char *delim = ""; - unsigned long mask; - int i; - - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - - printk(KERN_ALERT "page flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { - - mask = pageflag_names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - printk("%s%s", delim, pageflag_names[i].name); - delim = "|"; - } - - /* check for left over flags */ - if (flags) - printk("%s%#lx", delim, flags); - - printk(")\n"); -} - -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) -{ - printk(KERN_ALERT - "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, 
atomic_read(&page->_count), page_mapcount(page), - page->mapping, page->index); - dump_page_flags(page->flags); - if (reason) - pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_page_flags(page->flags & badflags); - } - mem_cgroup_print_bad_page(page); -} - -void dump_page(struct page *page, const char *reason) -{ - dump_page_badflags(page, reason, 0); -} -EXPORT_SYMBOL(dump_page); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf502c5..ad83195521f2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); pgd = pgd_offset(walk->mm, addr); do { diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fefc6a2..10e3d0b8a86d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,17 +33,14 @@ #include <linux/log2.h> -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } @@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, 0, nr_pages); + spin_unlock_irq(&pcpu_lock); + return chunk; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 3707c71ae4cd..538998a137d2 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, } /** 
- * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap - * @may_alloc: may allocate the array * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx(). The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. * * RETURNS: - * Pointer to temp pages array on success, NULL on failure. + * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) { static struct page **pages; - static unsigned long *bitmap; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_zalloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_zalloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + lockdep_assert_held(&pcpu_alloc_mutex); - *bitmapp = bitmap; + if (!pages) + pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, 
indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * @@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @@ -104,11 +80,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * content of @pages and will pass it verbatim to pcpu_map_pages(). 
*/ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - unsigned int cpu; + unsigned int cpu, tcpu; int i; for_each_possible_cpu(cpu) { @@ -116,14 +91,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); - if (!*pagep) { - pcpu_free_pages(chunk, pages, populated, - page_start, page_end); - return -ENOMEM; - } + if (!*pagep) + goto err; } } return 0; + +err: + while (--i >= page_start) + __free_page(pages[pcpu_page_idx(cpu, i)]); + + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + for (i = page_start; i < page_end; i++) + __free_page(pages[pcpu_page_idx(tcpu, i)]); + } + return -ENOMEM; } /** @@ -155,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * @@ -166,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * proper pre/post flush functions. 
*/ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -183,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } - - bitmap_clear(populated, page_start, page_end - page_start); } /** @@ -219,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped - * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * @@ -227,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). 
*/ static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; @@ -244,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, page_end - page_start); if (err < 0) goto err; - } - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) + for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); - __set_bit(i, populated); } - return 0; - err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) @@ -263,6 +234,7 @@ err: __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); } + pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } @@ -289,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. + * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned long *populated; - unsigned int cpu; - int rs, re, rc; - - /* quick path, check whether all pages are already there */ - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - goto clear; - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - 
return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - unsigned long *populated; - int rs, re; - - /* quick path, check whether it's empty already */ - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - return; - - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. 
*/ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + pages = pcpu_get_pages(chunk); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index 2139e30a4b44..014bab65e0ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,10 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,12 +106,16 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int 
pcpu_reserved_chunk_limit; +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ + +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ + /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. - * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked. + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static int pcpu_nr_empty_pop_pages; -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * Balance work is used to populate or destroy chunks asynchronously. 
We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size) } /** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. 
+ */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + +/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the left amount is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accommodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. 
*/ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; + + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -418,11 +469,76 @@ out_unlock: return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + +/** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned. + * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. 
+ */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. + */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -437,7 +553,8 @@ out_unlock: * Allocated offset in @chunk on success, -1 if no matching area is * found. 
*/ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? 
*/ if (!(p[1] & 1)) to_free++; @@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. 
* @@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + int occ_pages = 0; + int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -764,13 +934,15 @@ restart: if (size > chunk->contig_hint) continue; - 
new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -780,74 +952,134 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. + */ + if (is_atomic) + goto fail; + + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + if (!is_atomic) { + int page_start, page_end, rs, re; + + mutex_lock(&pcpu_alloc_mutex); + + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) 
{ + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off, &occ_pages); + err = "failed to populate"; + goto fail_unlock; + } + pcpu_chunk_populated(chunk, rs, re); + spin_unlock_irqrestore(&pcpu_lock, flags); + } + + mutex_unlock(&pcpu_alloc_mutex); } - mutex_unlock(&pcpu_alloc_mutex); + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - /* return address relative to base address */ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); +fail: + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_balance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. 
If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. 
+ */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + list_for_each_entry_safe(chunk, next, &to_free, list) { + int rs, re; + + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); + } pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. 
+ */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -946,7 +1258,7 @@ void free_percpu(void __percpu 
*ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + pcpu_schedule_balance_work(); break; } } @@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ @@ -1965,3 +2283,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab nor + * workqueue is available. Plug async management until everything is up + * and running. 
+ */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); diff --git a/mm/rmap.c b/mm/rmap.c index 3e8491c504f8..116a5053415b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) unsigned long address = __vma_address(page, vma); /* page should be within @vma mapping range */ - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); return address; } @@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page, struct anon_vma *anon_vma = vma->anon_vma; VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON(!anon_vma); + VM_BUG_ON_VMA(!anon_vma, vma); VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; @@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ if (PageTransHuge(page)) @@ -1355,7 +1355,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, continue; /* don't unmap */ } - if (ptep_clear_flush_young_notify(vma, address, pte)) + /* + * No need for _notify because we're within an + * mmu_notifier_invalidate_range_ {start|end} scope. + */ + if (ptep_clear_flush_young(vma, address, pte)) continue; /* Nuke the page table entry. */ @@ -1666,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) * structure at mapping cannot be freed and reused yet, * so we can safely take mapping->i_mmap_mutex. 
*/ - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) return ret; diff --git a/mm/shmem.c b/mm/shmem.c index 0e5fb225007c..cd6fc7590e54 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2367,8 +2367,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); - if (they_are_dirs) + if (they_are_dirs) { + drop_nlink(new_dentry->d_inode); drop_nlink(old_dir); + } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); @@ -2993,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; @@ -3075,7 +3077,9 @@ static const struct address_space_operations shmem_aops = { .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif +#ifdef CONFIG_MIGRATION .migratepage = migrate_page, +#endif .error_remove_page = generic_error_remove_page, }; diff --git a/mm/slab.c b/mm/slab.c index a467b308c682..eb2b2ea30130 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -237,11 +237,10 @@ struct arraycache_init { /* * Need this for bootstrapping a per node allocator. 
*/ -#define NUM_INIT_LISTS (3 * MAX_NUMNODES) +#define NUM_INIT_LISTS (2 * MAX_NUMNODES) static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; #define CACHE_CACHE 0 -#define SIZE_AC MAX_NUMNODES -#define SIZE_NODE (2 * MAX_NUMNODES) +#define SIZE_NODE (MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, struct kmem_cache_node *n, int tofree); @@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused); static int slab_early_init = 1; -#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) static void kmem_cache_node_init(struct kmem_cache_node *parent) @@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } -static struct arraycache_init initarray_generic = - { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; - /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, @@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { - return cachep->array[smp_processor_id()]; + return this_cpu_ptr(cachep->cpu_cache); } static size_t calculate_freelist_size(int nr_objs, size_t align) @@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep, return objp; } -static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) +static noinline void *__ac_put_obj(struct kmem_cache *cachep, + struct array_cache *ac, void *objp) { if (unlikely(pfmemalloc_active)) { /* Some pfmemalloc slabs exist, check if this is one */ @@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep, } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +static int __cache_free_alien(struct kmem_cache *cachep, void *objp, + int node, int page_node) { - int 
nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; struct alien_cache *alien = NULL; struct array_cache *ac; - int node; LIST_HEAD(list); - node = numa_mem_id(); - - /* - * Make sure we are not freeing a object from another node to the array - * cache on this cpu. - */ - if (likely(nodeid == node)) - return 0; - n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); - if (n->alien && n->alien[nodeid]) { - alien = n->alien[nodeid]; + if (n->alien && n->alien[page_node]) { + alien = n->alien[page_node]; ac = &alien->ac; spin_lock(&alien->lock); if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, ac, nodeid, &list); + __drain_alien_cache(cachep, ac, page_node, &list); } ac_put_obj(cachep, ac, objp); spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { - n = get_node(cachep, nodeid); + n = get_node(cachep, page_node); spin_lock(&n->list_lock); - free_block(cachep, &objp, 1, nodeid, &list); + free_block(cachep, &objp, 1, page_node, &list); spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } return 1; } + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + int page_node = page_to_nid(virt_to_page(objp)); + int node = numa_mem_id(); + /* + * Make sure we are not freeing a object from another node to the array + * cache on this cpu. + */ + if (likely(node == page_node)) + return 0; + + return __cache_free_alien(cachep, objp, node, page_node); +} #endif /* @@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu) struct alien_cache **alien; LIST_HEAD(list); - /* cpu is dead; no one can alloc from it. */ - nc = cachep->array[cpu]; - cachep->array[cpu] = NULL; n = get_node(cachep, node); - if (!n) - goto free_array_cache; + continue; spin_lock_irq(&n->list_lock); /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; - if (nc) + + /* cpu is dead; no one can alloc from it. 
*/ + nc = per_cpu_ptr(cachep->cpu_cache, cpu); + if (nc) { free_block(cachep, nc->entry, nc->avail, node, &list); + nc->avail = 0; + } if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); - goto free_array_cache; + goto free_slab; } shared = n->shared; @@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu) drain_alien_cache(cachep, alien); free_alien_cache(alien); } -free_array_cache: + +free_slab: slabs_destroy(cachep, &list); - kfree(nc); } /* * In the previous loop, all the objects were freed to @@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu) * array caches */ list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *nc; struct array_cache *shared = NULL; struct alien_cache **alien = NULL; - nc = alloc_arraycache(node, cachep->limit, - cachep->batchcount, GFP_KERNEL); - if (!nc) - goto bad; if (cachep->shared) { shared = alloc_arraycache(node, cachep->shared * cachep->batchcount, 0xbaadf00d, GFP_KERNEL); - if (!shared) { - kfree(nc); + if (!shared) goto bad; - } } if (use_alien_caches) { alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); if (!alien) { kfree(shared); - kfree(nc); goto bad; } } - cachep->array[cpu] = nc; n = get_node(cachep, node); BUG_ON(!n); @@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } /* - * The memory after the last cpu cache pointer is used for the - * the node pointer. - */ -static void setup_node_pointer(struct kmem_cache *cachep) -{ - cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; -} - -/* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). 
*/ @@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void) BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; - setup_node_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; @@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void) for (i = 0; i < NUM_INIT_LISTS; i++) kmem_cache_node_init(&init_kmem_cache_node[i]); - set_up_node(kmem_cache, CACHE_CACHE); - /* * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory if @@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void) * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, array[nr_cpu_ids]) + + offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN); list_add(&kmem_cache->list, &slab_caches); - - /* 2+3) create the kmalloc caches */ + slab_state = PARTIAL; /* - * Initialize the caches that provide memory for the array cache and the - * kmem_cache_node structures first. Without this, further allocations will - * bug. + * Initialize the caches that provide memory for the kmem_cache_node + * structures first. Without this, further allocations will bug. 
*/ - - kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", - kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); - - if (INDEX_AC != INDEX_NODE) - kmalloc_caches[INDEX_NODE] = - create_kmalloc_cache("kmalloc-node", + kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); + slab_state = PARTIAL_NODE; slab_early_init = 0; - /* 4) Replace the bootstrap head arrays */ - { - struct array_cache *ptr; - - ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - - memcpy(ptr, cpu_cache_get(kmem_cache), - sizeof(struct arraycache_init)); - - kmem_cache->array[smp_processor_id()] = ptr; - - ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - - BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) - != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), - sizeof(struct arraycache_init)); - - kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; - } /* 5) Replace the bootstrap kmem_cache_node */ { int nid; @@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(kmalloc_caches[INDEX_AC], - &init_kmem_cache_node[SIZE_AC + nid], nid); - - if (INDEX_AC != INDEX_NODE) { - init_list(kmalloc_caches[INDEX_NODE], + init_list(kmalloc_caches[INDEX_NODE], &init_kmem_cache_node[SIZE_NODE + nid], nid); - } } } @@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } +static struct array_cache __percpu *alloc_kmem_cache_cpus( + struct kmem_cache *cachep, int entries, int batchcount) +{ + int cpu; + size_t size; + struct array_cache __percpu *cpu_cache; + + size = sizeof(void *) * entries + sizeof(struct array_cache); + cpu_cache = __alloc_percpu(size, sizeof(void *)); + + if (!cpu_cache) + return NULL; + + for_each_possible_cpu(cpu) { + init_arraycache(per_cpu_ptr(cpu_cache, cpu), + entries, batchcount); + } + + return cpu_cache; +} + 
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { if (slab_state >= FULL) return enable_cpucache(cachep, gfp); + cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); + if (!cachep->cpu_cache) + return 1; + if (slab_state == DOWN) { - /* - * Note: Creation of first cache (kmem_cache). - * The setup_node is taken care - * of by the caller of __kmem_cache_create - */ - cachep->array[smp_processor_id()] = &initarray_generic.cache; - slab_state = PARTIAL; + /* Creation of first cache (kmem_cache). */ + set_up_node(kmem_cache, CACHE_CACHE); } else if (slab_state == PARTIAL) { - /* - * Note: the second kmem_cache_create must create the cache - * that's used by kmalloc(24), otherwise the creation of - * further caches will BUG(). - */ - cachep->array[smp_processor_id()] = &initarray_generic.cache; - - /* - * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is - * the second cache, then we need to set up all its node/, - * otherwise the creation of further caches will BUG(). 
- */ - set_up_node(cachep, SIZE_AC); - if (INDEX_AC == INDEX_NODE) - slab_state = PARTIAL_NODE; - else - slab_state = PARTIAL_ARRAYCACHE; + /* For kmem_cache_node */ + set_up_node(cachep, SIZE_NODE); } else { - /* Remaining boot caches */ - cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), gfp); + int node; - if (slab_state == PARTIAL_ARRAYCACHE) { - set_up_node(cachep, SIZE_NODE); - slab_state = PARTIAL_NODE; - } else { - int node; - for_each_online_node(node) { - cachep->node[node] = - kmalloc_node(sizeof(struct kmem_cache_node), - gfp, node); - BUG_ON(!cachep->node[node]); - kmem_cache_node_init(cachep->node[node]); - } + for_each_online_node(node) { + cachep->node[node] = kmalloc_node( + sizeof(struct kmem_cache_node), gfp, node); + BUG_ON(!cachep->node[node]); + kmem_cache_node_init(cachep->node[node]); } } + cachep->node[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_NODE + ((unsigned long)cachep) % REAPTIMEOUT_NODE; @@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } +unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} + +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *cachep; + + cachep = find_mergeable(size, align, flags, name, ctor); + if (cachep) { + cachep->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + cachep->object_size = max_t(int, cachep->object_size, size); + } + return cachep; +} + /** * __kmem_cache_create - Create a cache. 
* @cachep: cache management descriptor @@ -2124,7 +2094,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size, ralign; + size_t left_over, freelist_size; + size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; size_t size = cachep->size; @@ -2157,14 +2128,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size &= ~(BYTES_PER_WORD - 1); } - /* - * Redzoning and user store require word alignment or possibly larger. - * Note this will be overridden by architecture or caller mandated - * alignment if either is greater than BYTES_PER_WORD. - */ - if (flags & SLAB_STORE_USER) - ralign = BYTES_PER_WORD; - if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably @@ -2190,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - setup_node_pointer(cachep); #if DEBUG /* @@ -2447,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) if (rc) return rc; - for_each_online_cpu(i) - kfree(cachep->array[i]); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ for_each_kmem_cache_node(cachep, i, n) { @@ -2994,7 +2955,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. + * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. 
@@ -3226,7 +3187,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; @@ -3406,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) return; - if (likely(ac->avail < ac->limit)) { + if (ac->avail < ac->limit) { STATS_INC_FREEHIT(cachep); } else { STATS_INC_FREEMISS(cachep); @@ -3503,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) return kmem_cache_alloc_node_trace(cachep, flags, node, size); } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, _RET_IP_); @@ -3516,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, return __do_kmalloc_node(size, flags, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#else -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, 0); -} -EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ #endif /* CONFIG_NUMA */ /** @@ -3548,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return ret; } - -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, _RET_IP_); @@ -3562,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) } EXPORT_SYMBOL(__kmalloc_track_caller); -#else -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc(size, flags, 0); -} -EXPORT_SYMBOL(__kmalloc); -#endif - /** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. 
@@ -3714,72 +3657,45 @@ fail: return -ENOMEM; } -struct ccupdate_struct { - struct kmem_cache *cachep; - struct array_cache *new[0]; -}; - -static void do_ccupdate_local(void *info) -{ - struct ccupdate_struct *new = info; - struct array_cache *old; - - check_irq_off(); - old = cpu_cache_get(new->cachep); - - new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; - new->new[smp_processor_id()] = old; -} - /* Always called with the slab_mutex held */ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { - struct ccupdate_struct *new; - int i; + struct array_cache __percpu *cpu_cache, *prev; + int cpu; - new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), - gfp); - if (!new) + cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); + if (!cpu_cache) return -ENOMEM; - for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, - batchcount, gfp); - if (!new->new[i]) { - for (i--; i >= 0; i--) - kfree(new->new[i]); - kfree(new); - return -ENOMEM; - } - } - new->cachep = cachep; - - on_each_cpu(do_ccupdate_local, (void *)new, 1); + prev = cachep->cpu_cache; + cachep->cpu_cache = cpu_cache; + kick_all_cpus_sync(); check_irq_on(); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - for_each_online_cpu(i) { + if (!prev) + goto alloc_node; + + for_each_online_cpu(cpu) { LIST_HEAD(list); - struct array_cache *ccold = new->new[i]; int node; struct kmem_cache_node *n; + struct array_cache *ac = per_cpu_ptr(prev, cpu); - if (!ccold) - continue; - - node = cpu_to_mem(i); + node = cpu_to_mem(cpu); n = get_node(cachep, node); spin_lock_irq(&n->list_lock); - free_block(cachep, ccold->entry, ccold->avail, node, &list); + free_block(cachep, ac->entry, ac->avail, node, &list); spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); - kfree(ccold); } - kfree(new); + free_percpu(prev); + +alloc_node: return 
alloc_kmem_cache_node(cachep, gfp); } @@ -4262,19 +4178,15 @@ static const struct seq_operations slabstats_op = { static int slabstats_open(struct inode *inode, struct file *file) { - unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); - int ret = -ENOMEM; - if (n) { - ret = seq_open(file, &slabstats_op); - if (!ret) { - struct seq_file *m = file->private_data; - *n = PAGE_SIZE / (2 * sizeof(unsigned long)); - m->private = n; - n = NULL; - } - kfree(n); - } - return ret; + unsigned long *n; + + n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); + if (!n) + return -ENOMEM; + + *n = PAGE_SIZE / (2 * sizeof(unsigned long)); + + return 0; } static const struct file_operations proc_slabstats_operations = { diff --git a/mm/slab.h b/mm/slab.h index 0e0fdd365840..ab019e63e3c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -4,6 +4,41 @@ * Internal slab definitions */ +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. 
+ */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + unsigned long flags; /* Active flags on the slab */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include <linux/slab_def.h> +#endif + +#ifdef CONFIG_SLUB +#include <linux/slub_def.h> +#endif + +#include <linux/memcontrol.h> + /* * State of the slab allocator. * @@ -15,7 +50,6 @@ enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ - PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ @@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); struct mem_cgroup; -#ifdef CONFIG_SLUB + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)); + +unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { return NULL; } + +static inline unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} #endif @@ -303,8 +352,8 @@ 
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) * a kmem_cache_node structure allocated (which is true for all online nodes) */ #define for_each_kmem_cache_node(__s, __node, __n) \ - for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ - if (__n) + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index d319502b2403..3a6e0cfdf03a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -30,6 +30,43 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; +/* + * Set of flags that will prevent slab merging + */ +#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) + +#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA | SLAB_NOTRACK) + +/* + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) 
+ */ +static int slab_nomerge; + +static int __init setup_slab_nomerge(char *str) +{ + slab_nomerge = 1; + return 1; +} + +#ifdef CONFIG_SLUB +__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +#endif + +__setup("slab_nomerge", setup_slab_nomerge); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->object_size; +} +EXPORT_SYMBOL(kmem_cache_size); + #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, size_t size) { @@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) #endif #ifdef CONFIG_MEMCG_KMEM +static int memcg_alloc_cache_params(struct mem_cgroup *memcg, + struct kmem_cache *s, struct kmem_cache *root_cache) +{ + size_t size; + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); + size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + } else + s->memcg_params->is_root_cache = true; + + return 0; +} + +static void memcg_free_cache_params(struct kmem_cache *s) +{ + kfree(s->memcg_params); +} + +static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) +{ + int size; + struct memcg_cache_params *new_params, *cur_params; + + BUG_ON(!is_root_cache(s)); + + size = offsetof(struct memcg_cache_params, memcg_caches); + size += num_memcgs * sizeof(void *); + + new_params = kzalloc(size, GFP_KERNEL); + if (!new_params) + return -ENOMEM; + + cur_params = s->memcg_params; + memcpy(new_params->memcg_caches, cur_params->memcg_caches, + memcg_limited_groups_array_size * sizeof(void *)); + + new_params->is_root_cache = true; + + rcu_assign_pointer(s->memcg_params, new_params); + if (cur_params) + 
kfree_rcu(cur_params, rcu_head); + + return 0; +} + int memcg_update_all_caches(int num_memcgs) { struct kmem_cache *s; @@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs) if (!is_root_cache(s)) continue; - ret = memcg_update_cache_size(s, num_memcgs); + ret = memcg_update_cache_params(s, num_memcgs); /* - * See comment in memcontrol.c, memcg_update_cache_size: * Instead of freeing the memory, we'll just leave the caches * up to this point in an updated state. */ @@ -104,7 +199,80 @@ out: mutex_unlock(&slab_mutex); return ret; } -#endif +#else +static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, + struct kmem_cache *s, struct kmem_cache *root_cache) +{ + return 0; +} + +static inline void memcg_free_cache_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +/* + * Find a mergeable slab cache + */ +int slab_unmergeable(struct kmem_cache *s) +{ + if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE)) + return 1; + + if (!is_root_cache(s)) + return 1; + + if (s->ctor) + return 1; + + /* + * We may have set a slab to be unmergeable during bootstrap. + */ + if (s->refcount < 0) + return 1; + + return 0; +} + +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) +{ + struct kmem_cache *s; + + if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) + return NULL; + + if (ctor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); + + list_for_each_entry(s, &slab_caches, list) { + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. 
+ * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align - 1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + return s; + } + return NULL; +} /* * Figure out what the alignment of the objects will be given a set of @@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align, mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); - if (err) + if (err) { + s = NULL; /* suppress uninit var warning */ goto out_unlock; + } /* * Some allocators will constraint the set of valid flags to a subset diff --git a/mm/slob.c b/mm/slob.c index 21980e0f39a8..96a86206a26b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -#ifdef CONFIG_TRACING void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) { return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); @@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, return __do_kmalloc_node(size, gfp, node, caller); } #endif -#endif void kfree(const void *block) { diff --git a/mm/slub.c b/mm/slub.c index 3e8afcc07a76..ae7b9f1ad394 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) -/* - * Set of flags that will prevent slab merging - */ -#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) - -#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) - #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ @@ -1176,7 +1166,7 @@ out: __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long object_size, +unsigned long kmem_cache_flags(unsigned long 
object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long object_size, +unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { void *object; - int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; + int searchnode = node; + + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); + else if (!node_present_pages(node)) + searchnode = node_to_mem_node(node); object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, redo: if (unlikely(!node_match(page, node))) { - stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; - goto new_slab; + int searchnode = node; + + if (node != NUMA_NO_NODE && !node_present_pages(node)) + searchnode = node_to_mem_node(node); + + if (unlikely(!node_match(page, searchnode))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } } /* @@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; static int slub_min_objects; /* - * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) - */ -static int slub_nomerge; - -/* * Calculate the order of allocation given an slab object size. 
* * The order of allocation has significant impact on performance and other @@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -static int __init setup_slub_nomerge(char *str) -{ - slub_nomerge = 1; - return 1; -} - -__setup("slub_nomerge", setup_slub_nomerge); - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; @@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void) { } -/* - * Find a mergeable slab cache - */ -static int slab_unmergeable(struct kmem_cache *s) -{ - if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) - return 1; - - if (!is_root_cache(s)) - return 1; - - if (s->ctor) - return 1; - - /* - * We may have set a slab to be unmergeable during bootstrap. - */ - if (s->refcount < 0) - return 1; - - return 0; -} - -static struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)) -{ - struct kmem_cache *s; - - if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) - return NULL; - - if (ctor) - return NULL; - - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name, NULL); - - list_for_each_entry(s, &slab_caches, list) { - if (slab_unmergeable(s)) - continue; - - if (size > s->size) - continue; - - if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; - /* - * Check if alignment is compatible. 
- * Courtesy of Adrian Drzewiecki - */ - if ((s->size & ~(align - 1)) != s->size) - continue; - - if (s->size - size >= sizeof(void *)) - continue; - - return s; - } - return NULL; -} - struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) @@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf) static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { + /* + * Tracing a merged cache is going to give confusing results + * as well as cause other issues like converting a mergeable + * cache into an umergeable one. + */ + if (s->refcount > 1) + return -EINVAL; + s->flags &= ~SLAB_TRACE; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; @@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) static ssize_t failslab_store(struct kmem_cache *s, const char *buf, size_t length) { + if (s->refcount > 1) + return -EINVAL; + s->flags &= ~SLAB_FAILSLAB; if (buf[0] == '1') s->flags |= SLAB_FAILSLAB; diff --git a/mm/swap.c b/mm/swap.c index 6b2dc3897cd5..8a12b33936b4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -887,18 +887,14 @@ void lru_add_drain_all(void) mutex_unlock(&lock); } -/* - * Batched page_cache_release(). Decrement the reference count on all the - * passed pages. If it fell to zero then remove the page from the LRU and - * free it. - * - * Avoid taking zone->lru_lock if possible, but if it is taken, retain it - * for the remainder of the operation. +/** + * release_pages - batched page_cache_release() + * @pages: array of pages to release + * @nr: number of pages + * @cold: whether the pages are cache cold * - * The locking in this function is against shrink_inactive_list(): we recheck - * the page count inside the lock to see whether shrink_inactive_list() - * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() - * will free it. + * Decrement the reference count on all the pages in @pages. 
If it + * fell to zero, remove the page from the LRU and free it. */ void release_pages(struct page **pages, int nr, bool cold) { @@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold) struct zone *zone = NULL; struct lruvec *lruvec; unsigned long uninitialized_var(flags); + unsigned int uninitialized_var(lock_batch); for (i = 0; i < nr; i++) { struct page *page = pages[i]; @@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold) continue; } + /* + * Make sure the IRQ-safe lock-holding time does not get + * excessive with a continuous string of pages from the + * same zone. The lock is held only if zone != NULL. + */ + if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + if (!put_page_testzero(page)) continue; @@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); + lock_batch = 0; zone = pagezone; spin_lock_irqsave(&zone->lru_lock, flags); } diff --git a/mm/swap_state.c b/mm/swap_state.c index 3e0ec83d000c..154444918685 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -28,7 +28,9 @@ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .set_page_dirty = swap_set_page_dirty, +#ifdef CONFIG_MIGRATION .migratepage = migrate_page, +#endif }; static struct backing_dev_info swap_backing_dev_info = { @@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page) void free_pages_and_swap_cache(struct page **pages, int nr) { struct page **pagep = pages; + int i; lru_add_drain(); - while (nr) { - int todo = min(nr, PAGEVEC_SIZE); - int i; - - for (i = 0; i < todo; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, todo, false); - pagep += todo; - nr -= todo; - } + for (i = 0; i < nr; i++) + free_swap_cache(pagep[i]); + release_pages(pagep, nr, false); } /* diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..261eaf6e5a19 
100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -719,12 +720,68 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? 
*/ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + +/** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole diff --git a/mm/util.c b/mm/util.c index 093c973f1697..fec39d4509a9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t, /* * Check if the vma is being used as a stack. * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the pid of the task that - * the vma is stack for. + * just check in the current task. Returns the task_struct of the task + * that the vma is stack for. Must be called under rcu_read_lock(). 
*/ -pid_t vm_is_stack(struct task_struct *task, - struct vm_area_struct *vma, int in_group) +struct task_struct *task_of_stack(struct task_struct *task, + struct vm_area_struct *vma, bool in_group) { - pid_t ret = 0; - if (vm_is_stack_for_task(task, vma)) - return task->pid; + return task; if (in_group) { struct task_struct *t; - rcu_read_lock(); for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) { - ret = t->pid; - goto done; - } + if (vm_is_stack_for_task(t, vma)) + return t; } -done: - rcu_read_unlock(); } - return ret; + return NULL; } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2b0aa5486092..90520af7f186 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = { static int vmalloc_open(struct inode *inode, struct file *file) { - unsigned int *ptr = NULL; - int ret; - - if (IS_ENABLED(CONFIG_NUMA)) { - ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); - if (ptr == NULL) - return -ENOMEM; - } - ret = seq_open(file, &vmalloc_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = ptr; - } else - kfree(ptr); - return ret; + if (IS_ENABLED(CONFIG_NUMA)) + return seq_open_private(file, &vmalloc_op, + nr_node_ids * sizeof(unsigned int)); + else + return seq_open(file, &vmalloc_op); } static const struct file_operations proc_vmalloc_operations = { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2836b5373b2e..dcb47074ae03 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - zone_is_reclaim_writeback(zone)) { + test_bit(ZONE_WRITEBACK, &zone->flags)) { nr_immediate++; goto keep_locked; @@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - 
!zone_is_reclaim_dirty(zone))) { + !test_bit(ZONE_DIRTY, &zone->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback == nr_taken) - zone_set_flag(zone, ZONE_WRITEBACK); + set_bit(ZONE_WRITEBACK, &zone->flags); /* * memcg will stall in page writeback so only consider forcibly @@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - zone_set_flag(zone, ZONE_CONGESTED); + set_bit(ZONE_CONGESTED, &zone->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing - * pages from reclaim context. + * the zone ZONE_DIRTY and kswapd will start writing pages from + * reclaim context. */ if (nr_unqueued_dirty == nr_taken) - zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + set_bit(ZONE_DIRTY, &zone->flags); /* * If kswapd scans pages marked marked for immediate @@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) return reclaimable; } -/* Returns true if compaction should go ahead for a high-order request */ +/* + * Returns true if compaction should go ahead for a high-order request, or + * the high-order allocation would succeed without compaction. 
+ */ static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; @@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order) if (compaction_deferred(zone, order)) return watermark_ok; - /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, order)) + /* + * If compaction is not ready to start and allocation is not likely + * to succeed without it, then keep reclaiming. + */ + if (compaction_suitable(zone, order) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, gfp_t gfp_mask, - bool noswap) + bool may_swap) { struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .nr_to_reclaim = SWAP_CLUSTER_MAX, + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = !noswap, + .may_swap = may_swap, }; /* @@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order, return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && - !compaction_suitable(zone, order)) + compaction_suitable(zone, order) == COMPACT_SKIPPED) return false; return true; @@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone, /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - zone_clear_flag(zone, ZONE_WRITEBACK); + clear_bit(ZONE_WRITEBACK, &zone->flags); /* * If a zone reaches its high watermark, consider it to be no longer @@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone, */ if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { - 
zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } return sc->nr_scanned >= sc->nr_to_reclaim; @@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * If balanced, clear the dirty and congested * flags */ - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } } @@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (node_state(node_id, N_CPU) && node_id != numa_node_id()) return ZONE_RECLAIM_NOSCAN; - if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) + if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) return ZONE_RECLAIM_NOSCAN; ret = __zone_reclaim(zone, gfp_mask, order); - zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); @@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) } } #endif /* CONFIG_SHMEM */ - -static void warn_scan_unevictable_pages(void) -{ - printk_once(KERN_WARNING - "%s: The scan_unevictable_pages sysctl/node-interface has been " - "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n", - current->comm); -} - -/* - * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of - * all nodes' unevictable lists for evictable pages - */ -unsigned long scan_unevictable_pages; - -int scan_unevictable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - warn_scan_unevictable_pages(); - proc_doulongvec_minmax(table, write, buffer, length, ppos); - scan_unevictable_pages = 0; - return 0; -} - -#ifdef CONFIG_NUMA -/* - * per node 'scan_unevictable_pages' attribute. 
On demand re-scan of - * a specified node's per zone unevictable lists for evictable pages. - */ - -static ssize_t read_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - warn_scan_unevictable_pages(); - return sprintf(buf, "0\n"); /* always zero; should fit... */ -} - -static ssize_t write_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - warn_scan_unevictable_pages(); - return 1; -} - - -static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, - read_scan_unevictable_node, - write_scan_unevictable_node); - -int scan_unevictable_register_node(struct node *node) -{ - return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); -} - -void scan_unevictable_unregister_node(struct node *node) -{ - device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); -} -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index e9ab104b956f..1b12d390dc68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,6 +7,7 @@ * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., * Christoph Lameter <christoph@lameter.com> + * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> #include <linux/mm.h> @@ -14,6 +15,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/cpumask.h> #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> @@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif -static inline void fold_diff(int *diff) + +/* + * Fold a differential into the global counters. + * Returns the number of counters updated. 
+ */ +static int fold_diff(int *diff) { int i; + int changes = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (diff[i]) + if (diff[i]) { atomic_long_add(diff[i], &vm_stat[i]); + changes++; + } + return changes; } /* @@ -441,12 +452,15 @@ static inline void fold_diff(int *diff) * statistics in the remote zone struct as well as the global cachelines * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. + * + * The function returns the number of global counters updated. */ -static void refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int changes = 0; for_each_populated_zone(zone) { struct per_cpu_pageset __percpu *p = zone->pageset; @@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void) continue; } - if (__this_cpu_dec_return(p->expire)) continue; - if (__this_cpu_read(p->pcp.count)) + if (__this_cpu_read(p->pcp.count)) { drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } #endif } - fold_diff(global_diff); + changes += fold_diff(global_diff); + return changes; } /* @@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* Zoned VM counters */ + /* enum zone_stat_item countes */ "nr_free_pages", "nr_alloc_batch", "nr_inactive_anon", @@ -778,10 +794,13 @@ const char * const vmstat_text[] = { "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", + + /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS + /* enum vm_event_item counters */ "pgpgin", "pgpgout", "pswpin", @@ -860,6 +879,13 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_MEMORY_BALLOON + "balloon_inflate", + "balloon_deflate", +#ifdef 
CONFIG_BALLOON_COMPACTION + "balloon_migrate", +#endif +#endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", @@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(); - schedule_delayed_work(this_cpu_ptr(&vmstat_work), + if (refresh_cpu_vm_stats()) + /* + * Counters were updated so we expect more updates + * to occur in the future. Keep on running the + * update worker thread. + */ + schedule_delayed_work(this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else { + /* + * We did not update any counters so the app may be in + * a mode where it does not cause counter updates. + * We may be uselessly running vmstat_update. + * Defer the checking for differentials to the + * shepherd thread on a different processor. + */ + int r; + /* + * Shepherd work thread does not race since it never + * changes the bit if its zero but the cpu + * online / off line code may race if + * worker threads are still allowed during + * shutdown / startup. + */ + r = cpumask_test_and_set_cpu(smp_processor_id(), + cpu_stat_off); + VM_BUG_ON(r); + } +} + +/* + * Check if the diffs for a certain cpu indicate that + * an update is needed. + */ +static bool need_update(int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + + BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); + /* + * The fast way of checking if there are any vmstat diffs. + * This works because the diffs are byte sized items. 
+ */ + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + return true; + + } + return false; +} + + +/* + * Shepherd worker thread that checks the + * differentials of processors that have their worker + * threads for vm statistics updates disabled because of + * inactivity. + */ +static void vmstat_shepherd(struct work_struct *w); + +static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); + +static void vmstat_shepherd(struct work_struct *w) +{ + int cpu; + + get_online_cpus(); + /* Check processors whose vmstat worker threads have been disabled */ + for_each_cpu(cpu, cpu_stat_off) + if (need_update(cpu) && + cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + + schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), + __round_jiffies_relative(sysctl_stat_interval, cpu)); + + put_online_cpus(); + + schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); + } -static void start_cpu_timer(int cpu) +static void __init start_shepherd_timer(void) { - struct delayed_work *work = &per_cpu(vmstat_work, cpu); + int cpu; + + for_each_possible_cpu(cpu) + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), + vmstat_update); + + if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) + BUG(); + cpumask_copy(cpu_stat_off, cpu_online_mask); - INIT_DEFERRABLE_WORK(work, vmstat_update); - schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); } static void vmstat_cpu_dead(int node) @@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); - start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - per_cpu(vmstat_work, cpu).work.func = NULL; + cpumask_clear_cpu(cpu, cpu_stat_off); break; case 
CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier = static int __init setup_vmstat(void) { #ifdef CONFIG_SMP - int cpu; - cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) { - start_cpu_timer(cpu); - node_set_state(cpu_to_node(cpu), N_CPU); - } + start_shepherd_timer(); cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS diff --git a/mm/zbud.c b/mm/zbud.c index f26e7fcc7fa2..ecf1dbef6983 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -60,15 +60,17 @@ * NCHUNKS_ORDER determines the internal allocation granularity, effectively * adjusting internal fragmentation. It also determines the number of * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64, and there - * will be 64 freelists per pool. + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by zbud header, NCHUNKS will be calculated to + * 63 which shows the max number of free chunks in zbud page, also there will be + * 63 freelists per pool. */ #define NCHUNKS_ORDER 6 #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) #define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) #define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) /** * struct zbud_pool - stores metadata for each zbud pool @@ -268,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr) { /* * Rather than branch for different situations, just use the fact that - * free buddies have a length of zero to simplify everything. -1 at the - * end for the zbud header. + * free buddies have a length of zero to simplify everything. 
*/ - return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; } /***************** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 94f38fac5e81..839a48c3ca27 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -175,7 +175,7 @@ enum fullness_group { * n <= N / f, where * n = number of allocated objects * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac + * f = fullness_threshold_frac * * Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f @@ -199,9 +199,6 @@ struct size_class { spinlock_t lock; - /* stats */ - u64 pages_allocated; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; }; @@ -220,6 +217,7 @@ struct zs_pool { struct size_class size_class[ZS_SIZE_CLASSES]; gfp_t flags; /* allocation flags used when growing pool */ + atomic_long_t pages_allocated; }; /* @@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) static u64 zs_zpool_total_size(void *pool) { - return zs_get_total_size_bytes(pool); + return zs_get_total_pages(pool) << PAGE_SHIFT; } static struct zpool_driver zs_zpool_driver = { @@ -630,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) while (page) { struct page *next_page; struct link_free *link; - unsigned int i, objs_on_page; + unsigned int i = 1; /* * page->index stores offset of first object starting @@ -643,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class) link = (struct link_free *)kmap_atomic(page) + off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } + while ((off += class->size) < PAGE_SIZE) { + link->next = obj_location_to_handle(page, i++); + link += class->size / sizeof(*link); } /* @@ -662,7 +656,7 @@ static void init_zspage(struct page 
*first_page, struct size_class *class) link->next = obj_location_to_handle(next_page, 0); kunmap_atomic(link); page = next_page; - off = (off + class->size) % PAGE_SIZE; + off %= PAGE_SIZE; } } @@ -1028,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) return 0; set_zspage_mapping(first_page, class->index, ZS_EMPTY); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; } obj = (unsigned long)first_page->freelist; @@ -1082,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page->inuse--; fullness = fix_fullness_group(pool, first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - spin_unlock(&class->lock); - if (fullness == ZS_EMPTY) + if (fullness == ZS_EMPTY) { + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); free_zspage(first_page); + } } EXPORT_SYMBOL_GPL(zs_free); @@ -1183,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); -u64 zs_get_total_size_bytes(struct zs_pool *pool) +unsigned long zs_get_total_pages(struct zs_pool *pool) { - int i; - u64 npages = 0; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; - - return npages << PAGE_SHIFT; + return atomic_long_read(&pool->pages_allocated); } -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); +EXPORT_SYMBOL_GPL(zs_get_total_pages); module_init(zs_init); module_exit(zs_exit); |