diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 9 | ||||
-rw-r--r-- | mm/cma.c | 82 | ||||
-rw-r--r-- | mm/compaction.c | 157 | ||||
-rw-r--r-- | mm/debug.c | 5 | ||||
-rw-r--r-- | mm/frontswap.c | 6 | ||||
-rw-r--r-- | mm/huge_memory.c | 1 | ||||
-rw-r--r-- | mm/hugetlb.c | 4 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 103 | ||||
-rw-r--r-- | mm/internal.h | 32 | ||||
-rw-r--r-- | mm/iov_iter.c | 1062 | ||||
-rw-r--r-- | mm/memcontrol.c | 1710 | ||||
-rw-r--r-- | mm/memory-failure.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 56 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 30 | ||||
-rw-r--r-- | mm/mmap.c | 12 | ||||
-rw-r--r-- | mm/nobootmem.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 205 | ||||
-rw-r--r-- | mm/page_cgroup.c | 530 | ||||
-rw-r--r-- | mm/page_counter.c | 192 | ||||
-rw-r--r-- | mm/page_isolation.c | 45 | ||||
-rw-r--r-- | mm/rmap.c | 10 | ||||
-rw-r--r-- | mm/slab.c | 25 | ||||
-rw-r--r-- | mm/slab.h | 8 | ||||
-rw-r--r-- | mm/slab_common.c | 44 | ||||
-rw-r--r-- | mm/slub.c | 21 | ||||
-rw-r--r-- | mm/swap_cgroup.c | 208 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 6 | ||||
-rw-r--r-- | mm/vmalloc.c | 3 | ||||
-rw-r--r-- | mm/vmpressure.c | 8 | ||||
-rw-r--r-- | mm/vmscan.c | 18 |
35 files changed, 1902 insertions, 2719 deletions
diff --git a/mm/Makefile b/mm/Makefile index 8405eb0023a9..b3c6ce932c64 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,7 +55,9 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o +obj-$(CONFIG_PAGE_COUNTER) += page_counter.o +obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/bootmem.c b/mm/bootmem.c index 8a000cebb0d7..477be696511d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } @@ -124,6 +124,7 @@ static int __init cma_activate_area(struct cma *cma) err: kfree(cma->bitmap); + cma->count = 0; return -EINVAL; } @@ -214,12 +215,23 @@ int __init cma_declare_contiguous(phys_addr_t base, bool fixed, struct cma **res_cma) { phys_addr_t memblock_end = memblock_end_of_DRAM(); - phys_addr_t highmem_start = __pa(high_memory); + phys_addr_t highmem_start; int ret = 0; - pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", - __func__, (unsigned long)size, (unsigned long)base, - (unsigned long)limit, (unsigned long)alignment); +#ifdef CONFIG_X86 + /* + * high_memory isn't direct mapped memory so retrieving its physical + * address isn't appropriate. But it would be useful to check the + * physical address of the highmem boundary so it's justfiable to get + * the physical address from it. On x86 there is a validation check for + * this case, so the following workaround is needed to avoid it. + */ + highmem_start = __pa_nodebug(high_memory); +#else + highmem_start = __pa(high_memory); +#endif + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); @@ -244,52 +256,72 @@ int __init cma_declare_contiguous(phys_addr_t base, size = ALIGN(size, alignment); limit &= ~(alignment - 1); + if (!base) + fixed = false; + /* size should be aligned with order_per_bit */ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; /* - * adjust limit to avoid crossing low/high memory boundary for - * automatically allocated regions + * If allocating at a fixed base the request region must not cross the + * low/high memory boundary. */ - if (((limit == 0 || limit > memblock_end) && - (memblock_end - size < highmem_start && - memblock_end > highmem_start)) || - (!fixed && limit > highmem_start && limit - size < highmem_start)) { - limit = highmem_start; - } - - if (fixed && base < highmem_start && base+size > highmem_start) { + if (fixed && base < highmem_start && base + size > highmem_start) { ret = -EINVAL; - pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", - (unsigned long)base, (unsigned long)highmem_start); + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); goto err; } + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + /* Reserve memory */ - if (base && fixed) { + if (fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; goto err; } } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); + phys_addr_t addr = 0; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range(size, alignment, + highmem_start, limit); + limit = highmem_start; + } + if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; + addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } } + + base = addr; } ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); if (ret) goto err; - pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); + pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, + &base); return 0; err: diff --git a/mm/compaction.c b/mm/compaction.c index ec74cf0123ef..546e571e9d60 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; - unsigned long count = 0; + unsigned long high_pfn = 0; list_for_each_entry_safe(page, next, freelist, lru) { + unsigned long pfn = page_to_pfn(page); list_del(&page->lru); __free_page(page); - count++; + if (pfn > high_pfn) + high_pfn = pfn; } - return count; + return high_pfn; } static void map_pages(struct list_head *list) @@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc, /* Update where async and sync compaction should restart */ if (migrate_scanner) { - if (cc->finished_update_migrate) - return; if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; if (cc->mode != MIGRATE_ASYNC && pfn > zone->compact_cached_migrate_pfn[1]) zone->compact_cached_migrate_pfn[1] = pfn; } else { - if (cc->finished_update_free) - return; if (pfn < zone->compact_cached_free_pfn) zone->compact_cached_free_pfn = pfn; } @@ -479,6 +477,16 @@ isolate_freepages_range(struct compact_control *cc, block_end_pfn = min(block_end_pfn, end_pfn); + /* + * pfn could pass the block_end_pfn if isolated freepage + * is more than pageblock order. In this case, we adjust + * scanning range to right one. + */ + if (pfn >= block_end_pfn) { + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + } + if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) break; @@ -705,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; @@ -879,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc) block_start_pfn - pageblock_nr_pages; /* - * Set a flag that we successfully isolated in this pageblock. - * In the next loop iteration, zone->compact_cached_free_pfn - * will not be updated and thus it will effectively contain the - * highest pageblock we isolated pages from. - */ - if (isolated) - cc->finished_update_free = true; - - /* * isolate_freepages_block() might have aborted due to async * compaction being contended */ @@ -1029,8 +1027,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } acct_isolated(zone, cc); - /* Record where migration scanner will be restarted */ - cc->migrate_pfn = low_pfn; + /* + * Record where migration scanner will be restarted. If we end up in + * the same pageblock as the free scanner, make the scanners fully + * meet so that compact_finished() terminates compaction. + */ + cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } @@ -1072,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, /* Compaction run is not finished if the watermark is not met */ watermark = low_wmark_pages(zone); - watermark += (1 << cc->order); - if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, + cc->alloc_flags)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ @@ -1100,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order) +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) { int fragindex; unsigned long watermark; @@ -1112,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (order == -1) return COMPACT_CONTINUE; + watermark = low_wmark_pages(zone); + /* + * If watermarks for high-order allocation are already met, there + * should be no need for compaction at all. + */ + if (zone_watermark_ok(zone, order, watermark, classzone_idx, + alloc_flags)) + return COMPACT_PARTIAL; + /* * Watermarks for order-0 must be met for compaction. Note the 2UL. * This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + watermark += (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) return COMPACT_SKIPPED; /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * - * index of -1000 implies allocations might succeed depending on - * watermarks + * index of -1000 would imply allocations might succeed depending on + * watermarks, but we already failed the high-order watermark check * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * @@ -1136,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) return COMPACT_SKIPPED; - if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, - 0, 0)) - return COMPACT_PARTIAL; - return COMPACT_CONTINUE; } @@ -1150,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + unsigned long last_migrated_pfn = 0; - ret = compaction_suitable(zone, cc->order); + ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + cc->classzone_idx); switch (ret) { case COMPACT_PARTIAL: case COMPACT_SKIPPED: @@ -1194,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc, migratetype)) == COMPACT_CONTINUE) { int err; + unsigned long isolate_start_pfn = cc->migrate_pfn; switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: @@ -1202,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: - continue; + /* + * We haven't isolated and migrated anything, but + * there might still be unflushed migrations from + * previous cc->order aligned block. + */ + goto check_drain; case ISOLATE_SUCCESS: ; } @@ -1227,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) goto out; } } + + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. We use the pfn that + * isolate_migratepages() started from in this loop iteration + * - this is the lowest page that could have been isolated and + * then freed by migration. + */ + if (!last_migrated_pfn) + last_migrated_pfn = isolate_start_pfn; + +check_drain: + /* + * Has the migration scanner moved away from the previous + * cc->order aligned block where we migrated from? If yes, + * flush the pages that were freed, so that they can merge and + * compact_finished() can detect immediately if allocation + * would succeed. + */ + if (cc->order > 0 && last_migrated_pfn) { + int cpu; + unsigned long current_block_start = + cc->migrate_pfn & ~((1UL << cc->order) - 1); + + if (last_migrated_pfn < current_block_start) { + cpu = get_cpu(); + lru_add_drain_cpu(cpu); + drain_local_pages(zone); + put_cpu(); + /* No more flushing until we migrate again */ + last_migrated_pfn = 0; + } + } + } out: - /* Release free pages and check accounting */ - cc->nr_freepages -= release_freepages(&cc->freepages); - VM_BUG_ON(cc->nr_freepages != 0); + /* + * Release free pages and update where the free scanner should restart, + * so we don't leave any returned pages behind in the next attempt. + */ + if (cc->nr_freepages > 0) { + unsigned long free_pfn = release_freepages(&cc->freepages); + + cc->nr_freepages = 0; + VM_BUG_ON(free_pfn == 0); + /* The cached pfn is always the first in a pageblock */ + free_pfn &= ~(pageblock_nr_pages-1); + /* + * Only go back, not forward. The cached pfn might have been + * already reset to zone end in compact_finished() + */ + if (free_pfn > zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = free_pfn; + } trace_mm_compaction_end(ret); @@ -1240,7 +1305,8 @@ out: } static unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, int *contended) + gfp_t gfp_mask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1250,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .gfp_mask = gfp_mask, .zone = zone, .mode = mode, + .alloc_flags = alloc_flags, + .classzone_idx = classzone_idx, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1274,14 +1342,13 @@ int sysctl_extfrag_threshold = 500; * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention - * @candidate_zone: Return the zone where we think allocation should succeed * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, enum migrate_mode mode, int *contended, - struct zone **candidate_zone) + int alloc_flags, int classzone_idx) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1289,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_DEFERRED; - int alloc_flags = 0; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1298,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1312,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended); + &zone_contended, alloc_flags, classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1321,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) { - *candidate_zone = zone; + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), + classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller @@ -1345,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, goto break_loop; } - if (mode != MIGRATE_ASYNC) { + if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up diff --git a/mm/debug.c b/mm/debug.c index 5ce45c9a29b5..0e58f3211f89 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason, dump_flags(page->flags & badflags, pageflag_names, ARRAY_SIZE(pageflag_names)); } - mem_cgroup_print_bad_page(page); +#ifdef CONFIG_MEMCG + if (page->mem_cgroup) + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); +#endif } void dump_page(struct page *page, const char *reason) diff --git a/mm/frontswap.c b/mm/frontswap.c index c30eec536f03..8d82809eb085 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map) if (frontswap_ops) frontswap_ops->init(type); else { - BUG_ON(type > MAX_SWAPFILES); + BUG_ON(type >= MAX_SWAPFILES); set_bit(type, need_init); } } @@ -244,8 +244,10 @@ int __frontswap_store(struct page *page) the (older) page from frontswap */ inc_frontswap_failed_stores(); - if (dup) + if (dup) { __frontswap_clear(sis, offset); + frontswap_ops->invalidate_page(type, offset); + } } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de984159cf0b..5b2c6875fc38 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (!pmd_none(*pmd)) return false; entry = mk_pmd(zero_page, vma->vm_page_prot); - entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9fd722769927..30cd96879152 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2638,8 +2638,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + address = start; again: - for (address = start; address < end; address += sz) { + for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -2686,6 +2687,7 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { + address += sz; spin_unlock(ptl); break; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a67c26e0f360..037e1c00a5b7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -14,6 +14,7 @@ */ #include <linux/cgroup.h> +#include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> @@ -23,7 +24,7 @@ struct hugetlb_cgroup { /* * the counter to account for hugepages from hugetlb. */ - struct res_counter hugepage[HUGE_MAX_HSTATE]; + struct page_counter hugepage[HUGE_MAX_HSTATE]; }; #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) @@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) int idx; for (idx = 0; idx < hugetlb_max_hstate; idx++) { - if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) + if (page_counter_read(&h_cg->hugepage[idx])) return true; } return false; @@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_h_cgroup) { for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - res_counter_init(&h_cgroup->hugepage[idx], - &parent_h_cgroup->hugepage[idx]); + page_counter_init(&h_cgroup->hugepage[idx], + &parent_h_cgroup->hugepage[idx]); } else { root_h_cgroup = h_cgroup; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - res_counter_init(&h_cgroup->hugepage[idx], NULL); + page_counter_init(&h_cgroup->hugepage[idx], NULL); } return &h_cgroup->css; } @@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { - int csize; - struct res_counter *counter; - struct res_counter *fail_res; + unsigned int nr_pages; + struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); @@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, if (!page_hcg || page_hcg != h_cg) goto out; - csize = PAGE_SIZE << compound_order(page); + nr_pages = 1 << compound_order(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ - res_counter_charge_nofail(&parent->hugepage[idx], - csize, &fail_res); + page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; - res_counter_uncharge_until(counter, counter->parent, csize); + /* Take the pages off the local counter */ + page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(page, parent); out: @@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { int ret = 0; - struct res_counter *fail_res; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; - unsigned long csize = nr_pages * PAGE_SIZE; if (hugetlb_cgroup_disabled()) goto done; @@ -187,7 +186,7 @@ again: } rcu_read_unlock(); - ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); + ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); css_put(&h_cg->css); done: *ptr = h_cg; @@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { struct hugetlb_cgroup *h_cg; - unsigned long csize = nr_pages * PAGE_SIZE; if (hugetlb_cgroup_disabled()) return; @@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (unlikely(!h_cg)) return; set_hugetlb_cgroup(page, NULL); - res_counter_uncharge(&h_cg->hugepage[idx], csize); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); return; } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { - unsigned long csize = nr_pages * PAGE_SIZE; - if (hugetlb_cgroup_disabled() || !h_cg) return; if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) return; - res_counter_uncharge(&h_cg->hugepage[idx], csize); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); return; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, +}; + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - int idx, name; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; - return res_counter_read_u64(&h_cg->hugepage[idx], name); + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + default: + BUG(); + } } +static DEFINE_MUTEX(hugetlb_limit_mutex); + static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - int idx, name, ret; - unsigned long long val; + int ret, idx; + unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ + return -EINVAL; + buf = strstrip(buf); + ret = page_counter_memparse(buf, &nr_pages); + if (ret) + return ret; + idx = MEMFILE_IDX(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: - if (hugetlb_cgroup_is_root(h_cg)) { - /* Can't set limit on root */ - ret = -EINVAL; - break; - } - /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) - break; - val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); - ret = res_counter_set_limit(&h_cg->hugepage[idx], val); + mutex_lock(&hugetlb_limit_mutex); + ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; @@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - int idx, name, ret = 0; + int ret = 0; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: - res_counter_reset_max(&h_cg->hugepage[idx]); + page_counter_reset_watermark(counter); break; case RES_FAILCNT: - res_counter_reset_failcnt(&h_cg->hugepage[idx]); + counter->failcnt = 0; break; default: ret = -EINVAL; diff --git a/mm/internal.h b/mm/internal.h index 829304090b90..efad241f7014 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_index(unsigned long page_idx, unsigned int order) +{ + return page_idx ^ (1 << order); +} + +extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -136,13 +161,10 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ - bool finished_update_free; /* True when the zone cached pfns are - * no longer being updated - */ - bool finished_update_migrate; - int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + const int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock * contention detected during diff --git a/mm/iov_iter.c b/mm/iov_iter.c index eafcf60f6b83..a1599ca4ab0e 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -3,95 +3,136 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> - -static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - left = __copy_from_user(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; +#include <net/checksum.h> + +#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ + size_t left; \ + size_t wanted = n; \ + __p = i->iov; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } else { \ + left = 0; \ + } \ + while (unlikely(!left && n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted - n; \ +} + +#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->kvec; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + (void)(STEP); \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + (void)(STEP); \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted; \ +} + +#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->bvec; \ + __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ + if (likely(__v.bv_len)) { \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset + skip; \ + (void)(STEP); \ + skip += __v.bv_len; \ + n -= __v.bv_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.bv_len = min_t(size_t, n, __p->bv_len); \ + if (unlikely(!__v.bv_len)) \ + continue; \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset; \ + (void)(STEP); \ + skip = __v.bv_len; \ + n -= __v.bv_len; \ + } \ + n = wanted; \ +} + +#define iterate_all_kinds(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ +} + +#define iterate_and_advance(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + if (skip == bvec->bv_len) { \ + bvec++; \ + skip = 0; \ + } \ + i->nr_segs -= bvec - i->bvec; \ + i->bvec = bvec; \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + if (skip == kvec->iov_len) { \ + kvec++; \ + skip = 0; \ + } \ + i->nr_segs -= kvec - i->kvec; \ + i->kvec = kvec; \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + if (skip == iov->iov_len) { \ + iov++; \ + skip = 0; \ + } \ + i->nr_segs -= iov - i->iov; \ + i->iov = iov; \ + } \ + i->count -= n; \ + i->iov_offset = skip; \ } static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, @@ -256,134 +297,6 @@ done: return wanted - bytes; } -static size_t zero_iovec(size_t bytes, struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - left = __clear_user(buf, copy); - copy -= left; - skip += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __clear_user(buf, copy); - copy -= left; - skip = copy; - bytes -= copy; - } - - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -static size_t copy_from_user_atomic_iovec(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} - -static void advance_iovec(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} - /* * Fault in the first iovec of the given iov_iter, to a maximum length * of bytes. Returns 0 on success, or non-zero if the memory could not be @@ -395,7 +308,7 @@ static void advance_iovec(struct iov_iter *i, size_t bytes) */ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { - if (!(i->type & ITER_BVEC)) { + if (!(i->type & (ITER_BVEC|ITER_KVEC))) { char __user *buf = i->iov->iov_base + i->iov_offset; bytes = min(bytes, i->iov->iov_len - i->iov_offset); return fault_in_pages_readable(buf, bytes); @@ -404,136 +317,25 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) } EXPORT_SYMBOL(iov_iter_fault_in_readable); -static unsigned long alignment_iovec(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - unsigned long res; - size_t size = i->count; - size_t n; - - if (!size) - return 0; - - res = (unsigned long)iov->iov_base + i->iov_offset; - n = iov->iov_len - i->iov_offset; - if (n >= size) - return res | size; - size -= n; - res |= n; - while (size > (++iov)->iov_len) { - res |= (unsigned long)iov->iov_base | iov->iov_len; - size -= iov->iov_len; - } - res |= (unsigned long)iov->iov_base | size; - return res; -} - void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { /* It will get better. Eventually... */ - if (segment_eq(get_fs(), KERNEL_DS)) + if (segment_eq(get_fs(), KERNEL_DS)) { direction |= ITER_KVEC; - i->type = direction; - i->iov = iov; + i->type = direction; + i->kvec = (struct kvec *)iov; + } else { + i->type = direction; + i->iov = iov; + } i->nr_segs = nr_segs; i->iov_offset = 0; i->count = count; } EXPORT_SYMBOL(iov_iter_init); -static ssize_t get_pages_iovec(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - size_t offset = i->iov_offset; - const struct iovec *iov = i->iov; - size_t len; - unsigned long addr; - int n; - int res; - - len = iov->iov_len - offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - addr = (unsigned long)iov->iov_base + offset; - len += *start = addr & (PAGE_SIZE - 1); - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - addr &= ~(PAGE_SIZE - 1); - n = (len + PAGE_SIZE - 1) / PAGE_SIZE; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); - if (unlikely(res < 0)) - return res; - return (res == n ? len : res * PAGE_SIZE) - *start; -} - -static ssize_t get_pages_alloc_iovec(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - size_t offset = i->iov_offset; - const struct iovec *iov = i->iov; - size_t len; - unsigned long addr; - void *p; - int n; - int res; - - len = iov->iov_len - offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - addr = (unsigned long)iov->iov_base + offset; - len += *start = addr & (PAGE_SIZE - 1); - addr &= ~(PAGE_SIZE - 1); - n = (len + PAGE_SIZE - 1) / PAGE_SIZE; - - p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - if (!p) - return -ENOMEM; - - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); - if (unlikely(res < 0)) { - kvfree(p); - return res; - } - *pages = p; - return (res == n ? len : res * PAGE_SIZE) - *start; -} - -static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages) -{ - size_t offset = i->iov_offset; - size_t size = i->count; - const struct iovec *iov = i->iov; - int npages = 0; - int n; - - for (n = 0; size && n < i->nr_segs; n++, iov++) { - unsigned long addr = (unsigned long)iov->iov_base + offset; - size_t len = iov->iov_len - offset; - offset = 0; - if (unlikely(!len)) /* empty segment */ - continue; - if (len > size) - len = size; - npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE - - addr / PAGE_SIZE; - if (npages >= maxpages) /* don't bother going further */ - return maxpages; - size -= len; - offset = 0; - } - return min(npages, maxpages); -} - static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_atomic(page); @@ -555,293 +357,78 @@ static void memzero_page(struct page *page, size_t offset, size_t len) kunmap_atomic(addr); } -static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i) +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) { - size_t skip, copy, wanted; - const struct bio_vec *bvec; - + char *from = addr; if (unlikely(bytes > i->count)) bytes = i->count; if (unlikely(!bytes)) return 0; - wanted = bytes; - bvec = i->bvec; - skip = i->iov_offset; - copy = min_t(size_t, bytes, bvec->bv_len - skip); + iterate_and_advance(i, bytes, v, + __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, + v.iov_len), + memcpy_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len), + memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) + ) - memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); - skip += copy; - from += copy; - bytes -= copy; - while (bytes) { - bvec++; - copy = min(bytes, (size_t)bvec->bv_len); - memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy); - skip = copy; - from += copy; - bytes -= copy; - } - if (skip == bvec->bv_len) { - bvec++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= bvec - i->bvec; - i->bvec = bvec; - i->iov_offset = skip; - return wanted - bytes; + return bytes; } +EXPORT_SYMBOL(copy_to_iter); -static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i) +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { - size_t skip, copy, wanted; - const struct bio_vec *bvec; - + char *to = addr; if (unlikely(bytes > i->count)) bytes = i->count; if (unlikely(!bytes)) return 0; - wanted = bytes; - bvec = i->bvec; - skip = i->iov_offset; - - copy = min(bytes, bvec->bv_len - skip); - - memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); - - to += copy; - skip += copy; - bytes -= copy; - - while (bytes) { - bvec++; - copy = min(bytes, (size_t)bvec->bv_len); - memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy); - skip = copy; - to += copy; - bytes -= copy; - } - if (skip == bvec->bv_len) { - bvec++; - skip = 0; - } - i->count -= wanted; - i->nr_segs -= bvec - i->bvec; - i->bvec = bvec; - i->iov_offset = skip; - return wanted; -} - -static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, - size_t bytes, struct iov_iter *i) -{ - void *kaddr = kmap_atomic(page); - size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; -} + iterate_and_advance(i, bytes, v, + __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, + v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) -static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, - size_t bytes, struct iov_iter *i) -{ - void *kaddr = kmap_atomic(page); - size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; + return bytes; } +EXPORT_SYMBOL(copy_from_iter); -static size_t zero_bvec(size_t bytes, struct iov_iter *i) +size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { - size_t skip, copy, wanted; - const struct bio_vec *bvec; - + char *to = addr; if (unlikely(bytes > i->count)) bytes = i->count; if (unlikely(!bytes)) return 0; - wanted = bytes; - bvec = i->bvec; - skip = i->iov_offset; - copy = min_t(size_t, bytes, bvec->bv_len - skip); + iterate_and_advance(i, bytes, v, + __copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) - memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy); - skip += copy; - bytes -= copy; - while (bytes) { - bvec++; - copy = min(bytes, (size_t)bvec->bv_len); - memzero_page(bvec->bv_page, bvec->bv_offset, copy); - skip = copy; - bytes -= copy; - } - if (skip == bvec->bv_len) { - bvec++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= bvec - i->bvec; - i->bvec = bvec; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_from_user_bvec(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t left; - const struct bio_vec *bvec; - size_t base = i->iov_offset; - - kaddr = kmap_atomic(page); - for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) { - size_t copy = min(left, bvec->bv_len - base); - if (!bvec->bv_len) - continue; - memcpy_from_page(kaddr + offset, bvec->bv_page, - bvec->bv_offset + base, copy); - offset += copy; - left -= copy; - } - kunmap_atomic(kaddr); return bytes; } - -static void advance_bvec(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct bio_vec *bvec = i->bvec; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !bvec->bv_len)) { - int copy; - - copy = min(bytes, bvec->bv_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (bvec->bv_len == base) { - bvec++; - nr_segs--; - base = 0; - } - } - i->bvec = bvec; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} - -static unsigned long alignment_bvec(const struct iov_iter *i) -{ - const struct bio_vec *bvec = i->bvec; - unsigned long res; - size_t size = i->count; - size_t n; - - if (!size) - return 0; - - res = bvec->bv_offset + i->iov_offset; - n = bvec->bv_len - i->iov_offset; - if (n >= size) - return res | size; - size -= n; - res |= n; - while (size > (++bvec)->bv_len) { - res |= bvec->bv_offset | bvec->bv_len; - size -= bvec->bv_len; - } - res |= bvec->bv_offset | size; - return res; -} - -static ssize_t get_pages_bvec(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - const struct bio_vec *bvec = i->bvec; - size_t len = bvec->bv_len - i->iov_offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - /* can't be more than PAGE_SIZE */ - *start = bvec->bv_offset + i->iov_offset; - - get_page(*pages = bvec->bv_page); - - return len; -} - -static ssize_t get_pages_alloc_bvec(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - const struct bio_vec *bvec = i->bvec; - size_t len = bvec->bv_len - i->iov_offset; - if (len > i->count) - len = i->count; - if (len > maxsize) - len = maxsize; - *start = bvec->bv_offset + i->iov_offset; - - *pages = kmalloc(sizeof(struct page *), GFP_KERNEL); - if (!*pages) - return -ENOMEM; - - get_page(**pages = bvec->bv_page); - - return len; -} - -static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages) -{ - size_t offset = i->iov_offset; - size_t size = i->count; - const struct bio_vec *bvec = i->bvec; - int npages = 0; - int n; - - for (n = 0; size && n < i->nr_segs; n++, bvec++) { - size_t len = bvec->bv_len - offset; - offset = 0; - if (unlikely(!len)) /* empty segment */ - continue; - if (len > size) - len = size; - npages++; - if (npages >= maxpages) /* don't bother going further */ - return maxpages; - size -= len; - offset = 0; - } - return min(npages, maxpages); -} +EXPORT_SYMBOL(copy_from_iter_nocache); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { - if (i->type & ITER_BVEC) - return copy_page_to_iter_bvec(page, offset, bytes, i); - else + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_to_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else return copy_page_to_iter_iovec(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_to_iter); @@ -849,57 +436,53 @@ EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { - if (i->type & ITER_BVEC) - return copy_page_from_iter_bvec(page, offset, bytes, i); - else + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_from_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else return copy_page_from_iter_iovec(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_from_iter); -size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { - if (i->type & ITER_BVEC) - return copy_to_iter_bvec(addr, bytes, i); - else - return copy_to_iter_iovec(addr, bytes, i); -} -EXPORT_SYMBOL(copy_to_iter); + if (unlikely(bytes > i->count)) + bytes = i->count; -size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - if (i->type & ITER_BVEC) - return copy_from_iter_bvec(addr, bytes, i); - else - return copy_from_iter_iovec(addr, bytes, i); -} -EXPORT_SYMBOL(copy_from_iter); + if (unlikely(!bytes)) + return 0; -size_t iov_iter_zero(size_t bytes, struct iov_iter *i) -{ - if (i->type & ITER_BVEC) { - return zero_bvec(bytes, i); - } else { - return zero_iovec(bytes, i); - } + iterate_and_advance(i, bytes, v, + __clear_user(v.iov_base, v.iov_len), + memzero_page(v.bv_page, v.bv_offset, v.bv_len), + memset(v.iov_base, 0, v.iov_len) + ) + + return bytes; } EXPORT_SYMBOL(iov_iter_zero); size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { - if (i->type & ITER_BVEC) - return copy_from_user_bvec(page, i, offset, bytes); - else - return copy_from_user_atomic_iovec(page, i, offset, bytes); + char *kaddr = kmap_atomic(page), *p = kaddr + offset; + iterate_all_kinds(i, bytes, v, + __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + kunmap_atomic(kaddr); + return bytes; } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); void iov_iter_advance(struct iov_iter *i, size_t size) { - if (i->type & ITER_BVEC) - advance_bvec(i, size); - else - advance_iovec(i, size); + iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -911,18 +494,39 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) - return min(i->count, i->iov->iov_len - i->iov_offset); - else return min(i->count, i->bvec->bv_len - i->iov_offset); + else + return min(i->count, i->iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); +void iov_iter_kvec(struct iov_iter *i, int direction, + const struct kvec *iov, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_KVEC)); + i->type = direction; + i->kvec = (struct kvec *)iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_kvec); + unsigned long iov_iter_alignment(const struct iov_iter *i) { - if (i->type & ITER_BVEC) - return alignment_bvec(i); - else - return alignment_iovec(i); + unsigned long res = 0; + size_t size = i->count; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (unsigned long)v.iov_base | v.iov_len, 0), + res |= v.bv_offset | v.bv_len, + res |= (unsigned long)v.iov_base | v.iov_len + ) + return res; } EXPORT_SYMBOL(iov_iter_alignment); @@ -930,29 +534,207 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { - if (i->type & ITER_BVEC) - return get_pages_bvec(i, pages, maxsize, maxpages, start); - else - return get_pages_iovec(i, pages, maxsize, maxpages, start); + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + if (unlikely(res < 0)) + return res; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + get_page(*pages = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; } EXPORT_SYMBOL(iov_iter_get_pages); +static struct page **get_pages_array(size_t n) +{ + struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + return p; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { - if (i->type & ITER_BVEC) - return get_pages_alloc_bvec(i, pages, maxsize, start); - else - return get_pages_alloc_iovec(i, pages, maxsize, start); + struct page **p; + + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + p = get_pages_array(n); + if (!p) + return -ENOMEM; + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + *pages = p = get_pages_array(1); + if (!p) + return -ENOMEM; + get_page(*p = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; } EXPORT_SYMBOL(iov_iter_get_pages_alloc); +size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_from_iter); + +size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *from = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, + p + v.bv_offset, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_to_iter); + int iov_iter_npages(const struct iov_iter *i, int maxpages) { - if (i->type & ITER_BVEC) - return iov_iter_npages_bvec(i, maxpages); - else - return iov_iter_npages_iovec(i, maxpages); + size_t size = i->count; + int npages = 0; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, ({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + 0;}),({ + npages++; + if (npages >= maxpages) + return maxpages; + }),({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + }) + ) + return npages; } EXPORT_SYMBOL(iov_iter_npages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6ac0e33e150..85df503ec023 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,7 +25,7 @@ * GNU General Public License for more details. */ -#include <linux/res_counter.h> +#include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> @@ -51,7 +51,7 @@ #include <linux/seq_file.h> #include <linux/vmpressure.h> #include <linux/mm_inline.h> -#include <linux/page_cgroup.h> +#include <linux/swap_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> @@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu { unsigned long targets[MEM_CGROUP_NTARGETS]; }; -struct mem_cgroup_reclaim_iter { - /* - * last scanned hierarchy member. Valid only if last_dead_count - * matches memcg->dead_count of the hierarchy root group. - */ - struct mem_cgroup *last_visited; - int last_dead_count; - +struct reclaim_iter { + struct mem_cgroup *position; /* scan generation, increased every round-trip */ unsigned int generation; }; @@ -162,10 +156,10 @@ struct mem_cgroup_per_zone { struct lruvec lruvec; unsigned long lru_size[NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; + struct reclaim_iter iter[DEF_PRIORITY + 1]; struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ + unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ @@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; - u64 threshold; + unsigned long threshold; }; /* For threshold */ @@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); */ struct mem_cgroup { struct cgroup_subsys_state css; - /* - * the counter to account for memory usage - */ - struct res_counter res; + + /* Accounted resources */ + struct page_counter memory; + struct page_counter memsw; + struct page_counter kmem; + + unsigned long soft_limit; /* vmpressure notifications */ struct vmpressure vmpressure; @@ -296,15 +293,6 @@ struct mem_cgroup { int initialized; /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * the counter to account for kernel memory usage. - */ - struct res_counter kmem; - /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; @@ -352,7 +340,6 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; - atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif @@ -382,7 +369,6 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ - KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; #ifdef CONFIG_MEMCG_KMEM @@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } -static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) -{ - /* - * Our caller must use css_get() first, because memcg_uncharge_kmem() - * will call css_put() if it sees the memcg is dead. - */ - smp_wmb(); - if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) - set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); -} - -static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) -{ - return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, - &memcg->kmem_account_flags); -} #endif /* Stuffs for move charges at task migration. */ @@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) * This check can't live in kmem destruction function, * since the charges will outlive the cgroup */ - WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); + WARN_ON(page_counter_read(&memcg->kmem)); } #else static void disarm_kmem_keys(struct mem_cgroup *memcg) @@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg) disarm_kmem_keys(memcg); } -static void drain_all_stock_async(struct mem_cgroup *memcg); - static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { @@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page) static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) + unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, spin_unlock_irqrestore(&mctz->lock, flags); } +static unsigned long soft_limit_excess(struct mem_cgroup *memcg) +{ + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); + unsigned long excess = 0; + + if (nr_pages > soft_limit) + excess = nr_pages - soft_limit; + + return excess; +} static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { - unsigned long long excess; + unsigned long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; @@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { mz = mem_cgroup_page_zoneinfo(memcg, page); - excess = res_counter_soft_limit_excess(&memcg->res); + excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -825,7 +804,7 @@ retry: * position in the tree. */ __mem_cgroup_remove_exceeded(mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || + if (!soft_limit_excess(mz->memcg) || !css_tryget_online(&mz->memcg->css)) goto retry; done: @@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } -/* - * Returns a next (in a pre-order walk) alive memcg (with elevated css - * ref. count) or NULL if the whole root's subtree has been visited. - * - * helper function to be used by mem_cgroup_iter - */ -static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) -{ - struct cgroup_subsys_state *prev_css, *next_css; - - prev_css = last_visited ? &last_visited->css : NULL; -skip_node: - next_css = css_next_descendant_pre(prev_css, &root->css); - - /* - * Even if we found a group we have to make sure it is - * alive. css && !memcg means that the groups should be - * skipped and we should continue the tree walk. - * last_visited css is safe to use because it is - * protected by css_get and the tree walk is rcu safe. - * - * We do not take a reference on the root of the tree walk - * because we might race with the root removal when it would - * be the only node in the iterated hierarchy and mem_cgroup_iter - * would end up in an endless loop because it expects that at - * least one valid node will be returned. Root cannot disappear - * because caller of the iterator should hold it already so - * skipping css reference should be safe. - */ - if (next_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); - - if (next_css == &root->css) - return memcg; - - if (css_tryget_online(next_css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. - */ - if (smp_load_acquire(&memcg->initialized)) - return memcg; - css_put(next_css); - } - - prev_css = next_css; - goto skip_node; - } - - return NULL; -} - -static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) -{ - /* - * When a group in the hierarchy below root is destroyed, the - * hierarchy iterator can no longer be trusted since it might - * have pointed to the destroyed group. Invalidate it. - */ - atomic_inc(&root->dead_count); -} - -static struct mem_cgroup * -mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, - struct mem_cgroup *root, - int *sequence) -{ - struct mem_cgroup *position = NULL; - /* - * A cgroup destruction happens in two stages: offlining and - * release. They are separated by a RCU grace period. - * - * If the iterator is valid, we may still race with an - * offlining. The RCU lock ensures the object won't be - * released, tryget will fail if we lost the race. - */ - *sequence = atomic_read(&root->dead_count); - if (iter->last_dead_count == *sequence) { - smp_rmb(); - position = iter->last_visited; - - /* - * We cannot take a reference to root because we might race - * with root removal and returning NULL would end up in - * an endless loop on the iterator user level when root - * would be returned all the time. - */ - if (position && position != root && - !css_tryget_online(&position->css)) - position = NULL; - } - return position; -} - -static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, - struct mem_cgroup *last_visited, - struct mem_cgroup *new_position, - struct mem_cgroup *root, - int sequence) -{ - /* root reference counting symmetric to mem_cgroup_iter_load */ - if (last_visited && last_visited != root) - css_put(&last_visited->css); - /* - * We store the sequence count from the time @last_visited was - * loaded successfully instead of rereading it here so that we - * don't lose destruction events in between. We could have - * raced with the destruction of @new_position after all. - */ - iter->last_visited = new_position; - smp_wmb(); - iter->last_dead_count = sequence; -} - /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { + struct reclaim_iter *uninitialized_var(iter); + struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; - struct mem_cgroup *last_visited = NULL; + struct mem_cgroup *pos = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - last_visited = prev; + pos = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - goto out_css_put; + goto out; return root; } rcu_read_lock(); - while (!memcg) { - struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - int uninitialized_var(seq); - - if (reclaim) { - struct mem_cgroup_per_zone *mz; - - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); - iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) { - iter->last_visited = NULL; - goto out_unlock; - } - last_visited = mem_cgroup_iter_load(iter, root, &seq); + if (reclaim) { + struct mem_cgroup_per_zone *mz; + + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + iter = &mz->iter[reclaim->priority]; + + if (prev && reclaim->generation != iter->generation) + goto out_unlock; + + do { + pos = ACCESS_ONCE(iter->position); + /* + * A racing update may change the position and + * put the last reference, hence css_tryget(), + * or retry to see the updated position. + */ + } while (pos && !css_tryget(&pos->css)); + } + + if (pos) + css = &pos->css; + + for (;;) { + css = css_next_descendant_pre(css, &root->css); + if (!css) { + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + continue; + break; } - memcg = __mem_cgroup_iter_next(root, last_visited); + /* + * Verify the css and acquire a reference. The root + * is provided by the caller, so we know it's alive + * and kicking, and don't take an extra reference. + */ + memcg = mem_cgroup_from_css(css); + + if (css == &root->css) + break; - if (reclaim) { - mem_cgroup_iter_update(iter, last_visited, memcg, root, - seq); + if (css_tryget(css)) { + /* + * Make sure the memcg is initialized: + * mem_cgroup_css_online() orders the the + * initialization against setting the flag. + */ + if (smp_load_acquire(&memcg->initialized)) + break; - if (!memcg) - iter->generation++; - else if (!prev && memcg) - reclaim->generation = iter->generation; + css_put(css); } - if (prev && !memcg) - goto out_unlock; + memcg = NULL; + } + + if (reclaim) { + if (cmpxchg(&iter->position, pos, memcg) == pos) { + if (memcg) + css_get(&memcg->css); + if (pos) + css_put(&pos->css); + } + + /* + * pairs with css_tryget when dereferencing iter->position + * above. + */ + if (pos) + css_put(&pos->css); + + if (!memcg) + iter->generation++; + else if (!prev) + reclaim->generation = iter->generation; } + out_unlock: rcu_read_unlock(); -out_css_put: +out: if (prev && prev != root) css_put(&prev->css); @@ -1346,15 +1262,18 @@ out: } /** - * mem_cgroup_page_lruvec - return lruvec for adding an lru page + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page + * + * This function is only safe when following the LRU page isolation + * and putback protocol: the LRU lock must be held, and the page must + * either be PageLRU() or the caller must have isolated/allocated it. */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; - struct page_cgroup *pc; struct lruvec *lruvec; if (mem_cgroup_disabled()) { @@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) goto out; } - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; - + memcg = page->mem_cgroup; /* - * Surreptitiously switch any uncharged offlist page to root: - * an uncharged page off lru does nothing to secure - * its former mem_cgroup from sudden removal. - * - * Our caller holds lru_lock, and PageCgroupUsed is updated - * under page_cgroup lock: between them, they make all uses - * of pc->mem_cgroup safe. + * Swapcache readahead pages are added to the LRU - and + * possibly migrated - before they are charged. */ - if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) - pc->mem_cgroup = memcg = root_mem_cgroup; + if (!memcg) + memcg = root_mem_cgroup; mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; @@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, VM_BUG_ON((long)(*lru_size) < 0); } -/* - * Checks whether given mem is same or in the root_mem_cgroup's - * hierarchy subtree - */ -bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) +bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { - if (root_memcg == memcg) + if (root == memcg) return true; - if (!root_memcg->use_hierarchy || !memcg) + if (!root->use_hierarchy) return false; - return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); -} - -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); - rcu_read_unlock(); - return ret; + return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } -bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) { - struct mem_cgroup *curr = NULL; + struct mem_cgroup *task_memcg; struct task_struct *p; bool ret; p = find_lock_task_mm(task); if (p) { - curr = get_mem_cgroup_from_mm(p->mm); + task_memcg = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task, * killed to prevent needlessly killing additional tasks. */ rcu_read_lock(); - curr = mem_cgroup_from_task(task); - if (curr) - css_get(&curr->css); + task_memcg = mem_cgroup_from_task(task); + css_get(&task_memcg->css); rcu_read_unlock(); } - /* - * We should check use_hierarchy of "memcg" not "curr". Because checking - * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "memcg"). - */ - ret = mem_cgroup_same_or_subtree(memcg, curr); - css_put(&curr->css); + ret = mem_cgroup_is_descendant(task_memcg, memcg); + css_put(&task_memcg->css); return ret; } @@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } -#define mem_cgroup_from_res_counter(counter, member) \ +#define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) /** @@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) */ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { - unsigned long long margin; + unsigned long margin = 0; + unsigned long count; + unsigned long limit; - margin = res_counter_margin(&memcg->res); - if (do_swap_account) - margin = min(margin, res_counter_margin(&memcg->memsw)); - return margin >> PAGE_SHIFT; + count = page_counter_read(&memcg->memory); + limit = ACCESS_ONCE(memcg->memory.limit); + if (count < limit) + margin = limit - count; + + if (do_swap_account) { + count = page_counter_read(&memcg->memsw); + limit = ACCESS_ONCE(memcg->memsw.limit); + if (count <= limit) + margin = min(margin, limit - count); + } + + return margin; } int mem_cgroup_swappiness(struct mem_cgroup *memcg) @@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) } /* - * memcg->moving_account is used for checking possibility that some thread is - * calling move_account(). When a thread on CPU-A starts moving pages under - * a memcg, other threads should check memcg->moving_account under - * rcu_read_lock(), like this: - * - * CPU-A CPU-B - * rcu_read_lock() - * memcg->moving_account+1 if (memcg->mocing_account) - * take heavy locks. - * synchronize_rcu() update something. - * rcu_read_unlock() - * start move here. - */ - -static void mem_cgroup_start_move(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->moving_account); - synchronize_rcu(); -} - -static void mem_cgroup_end_move(struct mem_cgroup *memcg) -{ - /* - * Now, mem_cgroup_clear_mc() may call this function with NULL. - * We check NULL in callee rather than caller. - */ - if (memcg) - atomic_dec(&memcg->moving_account); -} - -/* * A routine for checking "mem" is under move_account() or not. * * Checking a cgroup is mc.from or mc.to or under hierarchy of @@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(memcg, from) - || mem_cgroup_same_or_subtree(memcg, to); + ret = mem_cgroup_is_descendant(from, memcg) || + mem_cgroup_is_descendant(to, memcg); unlock: spin_unlock(&mc.lock); return ret; @@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -/* - * Take this lock when - * - a code tries to modify page's memcg while it's USED. - * - a code tries to modify page state accounting in a memcg. - */ -static void move_lock_mem_cgroup(struct mem_cgroup *memcg, - unsigned long *flags) -{ - spin_lock_irqsave(&memcg->move_lock, *flags); -} - -static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, - unsigned long *flags) -{ - spin_unlock_irqrestore(&memcg->move_lock, *flags); -} - #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. @@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_unlock(); - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->res, RES_FAILCNT)); - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); - pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", - res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, - res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, - res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memory)), + K((u64)memcg->memory.limit), memcg->memory.failcnt); + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->memsw)), + K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", + K((u64)page_counter_read(&memcg->kmem)), + K((u64)memcg->kmem.limit), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { - u64 limit; - - limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + unsigned long limit; - /* - * Do not consider swap space if we cannot swap due to swappiness - */ + limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { - u64 memsw; + unsigned long memsw_limit; - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - - /* - * If memsw is finite and limits the amount of swap space - * available to this memcg, return that limit. - */ - limit = min(limit, memsw); + memsw_limit = memcg->memsw.limit; + limit = min(limit + total_swap_pages, memsw_limit); } - return limit; } @@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; + totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; struct task_struct *task; @@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) memcg->last_scanned_node = node; return node; } - -/* - * Check all nodes whether it contains reclaimable pages or not. - * For quick scan, we make use of scan_nodes. This will allow us to skip - * unused nodes. But scan_nodes is lazily updated and may not cotain - * enough new information. We need to do double check. - */ -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - int nid; - - /* - * quick check...making use of scan_node. - * We can skip unused nodes. - */ - if (!nodes_empty(memcg->scan_nodes)) { - for (nid = first_node(memcg->scan_nodes); - nid < MAX_NUMNODES; - nid = next_node(nid, memcg->scan_nodes)) { - - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - } - /* - * Check rest of nodes. - */ - for_each_node_state(nid, N_MEMORY) { - if (node_isset(nid, memcg->scan_nodes)) - continue; - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - return false; -} - #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } - -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); -} #endif static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, @@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, .priority = 0, }; - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; + excess = soft_limit_excess(root_memcg); while (1) { victim = mem_cgroup_iter(root_memcg, victim, &reclaim); @@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, } continue; } - if (!mem_cgroup_reclaimable(victim, false)) - continue; total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, zone, &nr_scanned); *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) + if (!soft_limit_excess(root_memcg)) break; } mem_cgroup_iter_break(root_memcg, victim); @@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait, oom_wait_info = container_of(wait, struct oom_wait_info, wait); oom_wait_memcg = oom_wait_info->memcg; - /* - * Both of oom_wait_info->memcg and wake_memcg are stable under us. - * Then we can use css_is_ancestor without taking care of RCU. - */ - if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) - && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) + if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && + !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) return 0; return autoremove_wake_function(wait, mode, sync, arg); } @@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, unsigned long *flags) { struct mem_cgroup *memcg; - struct page_cgroup *pc; rcu_read_lock(); if (mem_cgroup_disabled()) return NULL; - - pc = lookup_page_cgroup(page); again: - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) + memcg = page->mem_cgroup; + if (unlikely(!memcg)) return NULL; *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - move_lock_mem_cgroup(memcg, flags); - if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { - move_unlock_mem_cgroup(memcg, flags); + spin_lock_irqsave(&memcg->move_lock, *flags); + if (memcg != page->mem_cgroup) { + spin_unlock_irqrestore(&memcg->move_lock, *flags); goto again; } *locked = true; @@ -2261,11 +2048,11 @@ again: * @locked: value received from mem_cgroup_begin_page_stat() * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, - unsigned long flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, + unsigned long *flags) { - if (memcg && locked) - move_unlock_mem_cgroup(memcg, &flags); + if (memcg && *locked) + spin_unlock_irqrestore(&memcg->move_lock, *flags); rcu_read_unlock(); } @@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; - bool ret = true; + bool ret = false; if (nr_pages > CHARGE_BATCH) - return false; + return ret; stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages >= nr_pages) + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; - else /* need to call res_counter_charge */ - ret = false; + ret = true; + } put_cpu_var(memcg_stock); return ret; } /* - * Returns stocks cached in percpu to res_counter and reset cached information. + * Returns stocks cached in percpu and reset cached information. */ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; if (stock->nr_pages) { - unsigned long bytes = stock->nr_pages * PAGE_SIZE; - - res_counter_uncharge(&old->res, bytes); + page_counter_uncharge(&old->memory, stock->nr_pages); if (do_swap_account) - res_counter_uncharge(&old->memsw, bytes); + page_counter_uncharge(&old->memsw, stock->nr_pages); + css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; } stock->cached = NULL; @@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void) } /* - * Cache charges(val) which is from res_counter, to local per_cpu area. + * Cache charges(val) to local per_cpu area. * This will be consumed by consume_stock() function, later. */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) /* * Drains all per-CPU charge caches for given root_memcg resp. subtree - * of the hierarchy under it. sync flag says whether we should block - * until the work is done. + * of the hierarchy under it. */ -static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) +static void drain_all_stock(struct mem_cgroup *root_memcg) { int cpu, curcpu; + /* If someone's already draining, avoid adding running more workers. */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); curcpu = get_cpu(); @@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) memcg = stock->cached; if (!memcg || !stock->nr_pages) continue; - if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) + if (!mem_cgroup_is_descendant(memcg, root_memcg)) continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) @@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) } } put_cpu(); - - if (!sync) - goto out; - - for_each_online_cpu(cpu) { - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) - flush_work(&stock->work); - } -out: put_online_cpus(); -} - -/* - * Tries to drain stocked charges in other cpus. This function is asynchronous - * and just put a work per cpu for draining localy on each cpu. Caller can - * expects some charges will be back to res_counter later but cannot wait for - * it. - */ -static void drain_all_stock_async(struct mem_cgroup *root_memcg) -{ - /* - * If someone calls draining, avoid adding more kworker runs. - */ - if (!mutex_trylock(&percpu_charge_mutex)) - return; - drain_all_stock(root_memcg, false); - mutex_unlock(&percpu_charge_mutex); -} - -/* This is a synchronous drain interface. */ -static void drain_all_stock_sync(struct mem_cgroup *root_memcg) -{ - /* called when force_empty is called */ - mutex_lock(&percpu_charge_mutex); - drain_all_stock(root_memcg, true); mutex_unlock(&percpu_charge_mutex); } @@ -2506,9 +2259,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; - struct res_counter *fail_res; + struct page_counter *counter; unsigned long nr_reclaimed; - unsigned long long size; bool may_swap = true; bool drained = false; int ret = 0; @@ -2519,16 +2271,15 @@ retry: if (consume_stock(memcg, nr_pages)) goto done; - size = batch * PAGE_SIZE; if (!do_swap_account || - !res_counter_charge(&memcg->memsw, size, &fail_res)) { - if (!res_counter_charge(&memcg->res, size, &fail_res)) + !page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (!page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) - res_counter_uncharge(&memcg->memsw, size); - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + page_counter_uncharge(&memcg->memsw, batch); + mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + mem_over_limit = mem_cgroup_from_counter(counter, memsw); may_swap = false; } @@ -2561,7 +2312,7 @@ retry: goto retry; if (!drained) { - drain_all_stock_async(mem_over_limit); + drain_all_stock(mem_over_limit); drained = true; goto retry; } @@ -2603,6 +2354,7 @@ bypass: return -EINTR; done_restock: + css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: @@ -2611,32 +2363,14 @@ done: static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) return; - res_counter_uncharge(&memcg->res, bytes); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); -} - -/* - * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. - * This is useful when moving usage to parent cgroup. - */ -static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ - unsigned long bytes = nr_pages * PAGE_SIZE; - - if (mem_cgroup_is_root(memcg)) - return; + page_counter_uncharge(&memcg->memsw, nr_pages); - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); - if (do_swap_account) - res_counter_uncharge_until(&memcg->memsw, - memcg->memsw.parent, bytes); + css_put_many(&memcg->css, nr_pages); } /* @@ -2665,17 +2399,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; + struct mem_cgroup *memcg; unsigned short id; swp_entry_t ent; VM_BUG_ON_PAGE(!PageLocked(page), page); - pc = lookup_page_cgroup(page); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - if (memcg && !css_tryget_online(&memcg->css)) + memcg = page->mem_cgroup; + if (memcg) { + if (!css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); @@ -2723,14 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated) static void commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare) { - struct page_cgroup *pc = lookup_page_cgroup(page); int isolated; - VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); - /* - * we don't need page_cgroup_lock about tail pages, becase they are not - * accessed by any other context at this point. - */ + VM_BUG_ON_PAGE(page->mem_cgroup, page); /* * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page @@ -2741,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point: + * page->mem_cgroup at this point: * * - the page is uncharged * @@ -2753,15 +2480,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * - a page cache insertion, a swapin fault, or a migration * have the page locked */ - pc->mem_cgroup = memcg; - pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); + page->mem_cgroup = memcg; if (lrucare) unlock_page_lru(page, isolated); } -static DEFINE_MUTEX(set_limit_mutex); - #ifdef CONFIG_MEMCG_KMEM /* * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or @@ -2769,8 +2493,6 @@ static DEFINE_MUTEX(set_limit_mutex); */ static DEFINE_MUTEX(memcg_slab_mutex); -static DEFINE_MUTEX(activate_kmem_mutex); - /* * This is a bit cumbersome, but it is rarely used and avoids a backpointer * in the memcg_cache_params struct. @@ -2784,36 +2506,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -#ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - struct memcg_cache_params *params; - - if (!memcg_kmem_is_active(memcg)) - return -EIO; - - print_slabinfo_header(m); - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) - cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg_slab_mutex); - - return 0; -} -#endif - -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { - struct res_counter *fail_res; + struct page_counter *counter; int ret = 0; - ret = res_counter_charge(&memcg->kmem, size, &fail_res); - if (ret) + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret < 0) return ret; - ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); + ret = try_charge(memcg, gfp, nr_pages); if (ret == -EINTR) { /* * try_charge() chose to bypass to root due to OOM kill or @@ -2830,37 +2533,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ - res_counter_charge_nofail(&memcg->res, size, &fail_res); + page_counter_charge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_charge_nofail(&memcg->memsw, size, - &fail_res); + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); ret = 0; } else if (ret) - res_counter_uncharge(&memcg->kmem, size); + page_counter_uncharge(&memcg->kmem, nr_pages); return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, + unsigned long nr_pages) { - res_counter_uncharge(&memcg->res, size); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) - res_counter_uncharge(&memcg->memsw, size); + page_counter_uncharge(&memcg->memsw, nr_pages); - /* Not down to 0 */ - if (res_counter_uncharge(&memcg->kmem, size)) - return; + page_counter_uncharge(&memcg->kmem, nr_pages); - /* - * Releases a reference taken in kmem_cgroup_css_offline in case - * this last uncharge is racing with the offlining code or it is - * outliving the memcg existence. - * - * The memory barrier imposed by test&clear is paired with the - * explicit one in memcg_kmem_mark_dead(). - */ - if (memcg_kmem_test_and_clear_dead(memcg)) - css_put(&memcg->css); + css_put_many(&memcg->css, nr_pages); } /* @@ -3124,19 +2817,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) { + unsigned int nr_pages = 1 << order; int res; - res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, - PAGE_SIZE << order); + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); if (!res) - atomic_add(1 << order, &cachep->memcg_params->nr_pages); + atomic_add(nr_pages, &cachep->memcg_params->nr_pages); return res; } void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) { - memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); - atomic_sub(1 << order, &cachep->memcg_params->nr_pages); + unsigned int nr_pages = 1 << order; + + memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); + atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); } /* @@ -3257,7 +2952,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) return true; } - ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); + ret = memcg_charge_kmem(memcg, gfp, 1 << order); if (!ret) *_memcg = memcg; @@ -3268,46 +2963,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) { - struct page_cgroup *pc; - VM_BUG_ON(mem_cgroup_is_root(memcg)); /* The page allocation failed. Revert */ if (!page) { - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + memcg_uncharge_kmem(memcg, 1 << order); return; } - /* - * The page is freshly allocated and not visible to any - * outside callers yet. Set up pc non-atomically. - */ - pc = lookup_page_cgroup(page); - pc->mem_cgroup = memcg; - pc->flags = PCG_USED; + page->mem_cgroup = memcg; } void __memcg_kmem_uncharge_pages(struct page *page, int order) { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - + struct mem_cgroup *memcg = page->mem_cgroup; - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) - return; - - memcg = pc->mem_cgroup; - pc->flags = 0; - - /* - * We trust that only if there is a memcg associated with the page, it - * is a valid allocation - */ if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); + + memcg_uncharge_kmem(memcg, 1 << order); + page->mem_cgroup = NULL; } #else static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) @@ -3325,21 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct page_cgroup *head_pc = lookup_page_cgroup(head); - struct page_cgroup *pc; - struct mem_cgroup *memcg; int i; if (mem_cgroup_disabled()) return; - memcg = head_pc->mem_cgroup; - for (i = 1; i < HPAGE_PMD_NR; i++) { - pc = head_pc + i; - pc->mem_cgroup = memcg; - pc->flags = head_pc->flags; - } - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + for (i = 1; i < HPAGE_PMD_NR; i++) + head[i].mem_cgroup = head->mem_cgroup; + + __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3348,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) * mem_cgroup_move_account - move account of the page * @page: the page * @nr_pages: number of regular pages (>1 for huge pages) - * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @@ -3361,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) */ static int mem_cgroup_move_account(struct page *page, unsigned int nr_pages, - struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to) { @@ -3381,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup * of its source page while we change it: page migration takes * both pages off the LRU, but page cache replacement doesn't. */ @@ -3389,10 +3057,10 @@ static int mem_cgroup_move_account(struct page *page, goto out; ret = -EINVAL; - if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + if (page->mem_cgroup != from) goto out_unlock; - move_lock_mem_cgroup(from, &flags); + spin_lock_irqsave(&from->move_lock, flags); if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@ -3409,14 +3077,15 @@ static int mem_cgroup_move_account(struct page *page, } /* - * It is safe to change pc->mem_cgroup here because the page + * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with * uncharging, charging, migration, or LRU putback. */ /* caller should have done css_get */ - pc->mem_cgroup = to; - move_unlock_mem_cgroup(from, &flags); + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + ret = 0; local_irq_disable(); @@ -3431,72 +3100,6 @@ out: return ret; } -/** - * mem_cgroup_move_parent - moves page to the parent group - * @page: the page to move - * @pc: page_cgroup of the page - * @child: page's cgroup - * - * move charges to its parent or the root cgroup if the group has no - * parent (aka use_hierarchy==0). - * Although this might fail (get_page_unless_zero, isolate_lru_page or - * mem_cgroup_move_account fails) the failure is always temporary and - * it signals a race with a page removal/uncharge or migration. In the - * first case the page is on the way out and it will vanish from the LRU - * on the next attempt and the call should be retried later. - * Isolation from the LRU fails only if page has been isolated from - * the LRU since we looked at it and that usually means either global - * reclaim or migration going on. The page will either get back to the - * LRU or vanish. - * Finaly mem_cgroup_move_account fails only if the page got uncharged - * (!PageCgroupUsed) or moved to a different group. The page will - * disappear in the next attempt. - */ -static int mem_cgroup_move_parent(struct page *page, - struct page_cgroup *pc, - struct mem_cgroup *child) -{ - struct mem_cgroup *parent; - unsigned int nr_pages; - unsigned long uninitialized_var(flags); - int ret; - - VM_BUG_ON(mem_cgroup_is_root(child)); - - ret = -EBUSY; - if (!get_page_unless_zero(page)) - goto out; - if (isolate_lru_page(page)) - goto put; - - nr_pages = hpage_nr_pages(page); - - parent = parent_mem_cgroup(child); - /* - * If no parent, move charges to root cgroup. - */ - if (!parent) - parent = root_mem_cgroup; - - if (nr_pages > 1) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - flags = compound_lock_irqsave(page); - } - - ret = mem_cgroup_move_account(page, nr_pages, - pc, child, parent); - if (!ret) - __mem_cgroup_cancel_local_charge(child, nr_pages); - - if (nr_pages > 1) - compound_unlock_irqrestore(page, flags); - putback_lru_page(page); -put: - put_page(page); -out: - return ret; -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -3516,7 +3119,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, * * Returns 0 on success, -EINVAL on failure. * - * The caller must have charged to @to, IOW, called res_counter_charge() about + * The caller must have charged to @to, IOW, called page_counter_charge() about * both res and memsw, and called css_get(). */ static int mem_cgroup_move_swap_account(swp_entry_t entry, @@ -3532,7 +3135,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, mem_cgroup_swap_statistics(to, true); /* * This function is only called from task migration context now. - * It postpones res_counter and refcount handling till the end + * It postpones page_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance * improvement. But we cannot postpone css_get(to) because if * the process that has been moved to @to does swap-in, the @@ -3554,96 +3157,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -#ifdef CONFIG_DEBUG_VM -static struct page_cgroup *lookup_page_cgroup_used(struct page *page) -{ - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - /* - * Can be NULL while feeding pages into the page allocator for - * the first time, i.e. during boot or memory hotplug; - * or when mem_cgroup_disabled(). - */ - if (likely(pc) && PageCgroupUsed(pc)) - return pc; - return NULL; -} - -bool mem_cgroup_bad_page_check(struct page *page) -{ - if (mem_cgroup_disabled()) - return false; - - return lookup_page_cgroup_used(page) != NULL; -} - -void mem_cgroup_print_bad_page(struct page *page) -{ - struct page_cgroup *pc; - - pc = lookup_page_cgroup_used(page); - if (pc) { - pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", - pc, pc->flags, pc->mem_cgroup); - } -} -#endif +static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; int retry_count; - int ret = 0; - int children = mem_cgroup_count_children(memcg); - u64 curusage, oldusage; - int enlarge; + int ret; /* * For keeping hierarchical_reclaim simple, how long we should retry * is depends on callers. We set our retry-count to be function * of # of children which we should visit in this loop. */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); - oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + oldusage = page_counter_read(&memcg->memory); - enlarge = 0; - while (retry_count) { + do { if (signal_pending(current)) { ret = -EINTR; break; } - /* - * Rather than hide all in some function, I do this in - * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. - */ - mutex_lock(&set_limit_mutex); - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { + + mutex_lock(&memcg_limit_mutex); + if (limit > memcg->memsw.limit) { + mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; - mutex_unlock(&set_limit_mutex); break; } - - if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) - enlarge = 1; - - ret = res_counter_set_limit(&memcg->res, val); - mutex_unlock(&set_limit_mutex); + if (limit > memcg->memory.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memory, limit); + mutex_unlock(&memcg_limit_mutex); if (!ret) break; try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); - curusage = res_counter_read_u64(&memcg->res, RES_USAGE); + curusage = page_counter_read(&memcg->memory); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; else oldusage = curusage; - } + } while (retry_count); + if (!ret && enlarge) memcg_oom_recover(memcg); @@ -3651,52 +3215,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; int retry_count; - u64 oldusage, curusage; - int children = mem_cgroup_count_children(memcg); - int ret = -EBUSY; - int enlarge = 0; + int ret; /* see mem_cgroup_resize_res_limit */ - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - while (retry_count) { + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); + + oldusage = page_counter_read(&memcg->memsw); + + do { if (signal_pending(current)) { ret = -EINTR; break; } - /* - * Rather than hide all in some function, I do this in - * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. - */ - mutex_lock(&set_limit_mutex); - if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { + + mutex_lock(&memcg_limit_mutex); + if (limit < memcg->memory.limit) { + mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; - mutex_unlock(&set_limit_mutex); break; } - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) - enlarge = 1; - ret = res_counter_set_limit(&memcg->memsw, val); - mutex_unlock(&set_limit_mutex); + if (limit > memcg->memsw.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memsw, limit); + mutex_unlock(&memcg_limit_mutex); if (!ret) break; try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); - curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + curusage = page_counter_read(&memcg->memsw); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; else oldusage = curusage; - } + } while (retry_count); + if (!ret && enlarge) memcg_oom_recover(memcg); + return ret; } @@ -3709,7 +3274,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, unsigned long reclaimed; int loop = 0; struct mem_cgroup_tree_per_zone *mctz; - unsigned long long excess; + unsigned long excess; unsigned long nr_scanned; if (order > 0) @@ -3735,35 +3300,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, nr_reclaimed += reclaimed; *total_scanned += nr_scanned; spin_lock_irq(&mctz->lock); + __mem_cgroup_remove_exceeded(mz, mctz); /* * If we failed to reclaim anything from this memory cgroup * it is time to move on to the next cgroup */ next_mz = NULL; - if (!reclaimed) { - do { - /* - * Loop until we find yet another one. - * - * By the time we get the soft_limit lock - * again, someone might have aded the - * group back on the RB tree. Iterate to - * make sure we get a different mem. - * mem_cgroup_largest_soft_limit_node returns - * NULL if no other cgroup is present on - * the tree - */ - next_mz = - __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) - css_put(&next_mz->memcg->css); - else /* next_mz == NULL or other memcg */ - break; - } while (1); - } - __mem_cgroup_remove_exceeded(mz, mctz); - excess = res_counter_soft_limit_excess(&mz->memcg->res); + if (!reclaimed) + next_mz = __mem_cgroup_largest_soft_limit_node(mctz); + + excess = soft_limit_excess(mz->memcg); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. @@ -3792,107 +3339,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return nr_reclaimed; } -/** - * mem_cgroup_force_empty_list - clears LRU of a group - * @memcg: group to clear - * @node: NUMA node - * @zid: zone id - * @lru: lru to to clear - * - * Traverse a specified page_cgroup list and try to drop them all. This doesn't - * reclaim the pages page themselves - pages are moved to the parent (or root) - * group. - */ -static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, - int node, int zid, enum lru_list lru) -{ - struct lruvec *lruvec; - unsigned long flags; - struct list_head *list; - struct page *busy; - struct zone *zone; - - zone = &NODE_DATA(node)->node_zones[zid]; - lruvec = mem_cgroup_zone_lruvec(zone, memcg); - list = &lruvec->lists[lru]; - - busy = NULL; - do { - struct page_cgroup *pc; - struct page *page; - - spin_lock_irqsave(&zone->lru_lock, flags); - if (list_empty(list)) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - break; - } - page = list_entry(list->prev, struct page, lru); - if (busy == page) { - list_move(&page->lru, list); - busy = NULL; - spin_unlock_irqrestore(&zone->lru_lock, flags); - continue; - } - spin_unlock_irqrestore(&zone->lru_lock, flags); - - pc = lookup_page_cgroup(page); - - if (mem_cgroup_move_parent(page, pc, memcg)) { - /* found lock contention or "pc" is obsolete. */ - busy = page; - } else - busy = NULL; - cond_resched(); - } while (!list_empty(list)); -} - -/* - * make mem_cgroup's charge to be 0 if there is no task by moving - * all the charges and pages to the parent. - * This enables deleting this mem_cgroup. - * - * Caller is responsible for holding css reference on the memcg. - */ -static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) -{ - int node, zid; - u64 usage; - - do { - /* This is for making all *used* pages to be on LRU. */ - lru_add_drain_all(); - drain_all_stock_sync(memcg); - mem_cgroup_start_move(memcg); - for_each_node_state(node, N_MEMORY) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - enum lru_list lru; - for_each_lru(lru) { - mem_cgroup_force_empty_list(memcg, - node, zid, lru); - } - } - } - mem_cgroup_end_move(memcg); - memcg_oom_recover(memcg); - cond_resched(); - - /* - * Kernel memory may not necessarily be trackable to a specific - * process. So they are not migrated, and therefore we can't - * expect their value to drop to 0 here. - * Having res filled up with kmem only is enough. - * - * This is a safety check because mem_cgroup_force_empty_list - * could have raced with mem_cgroup_replace_page_cache callers - * so the lru seemed empty but the page could have been added - * right after the check. RES_USAGE should be safe as we always - * charge before adding to the LRU. - */ - usage = res_counter_read_u64(&memcg->res, RES_USAGE) - - res_counter_read_u64(&memcg->kmem, RES_USAGE); - } while (usage > 0); -} - /* * Test whether @memcg has children, dead or alive. Note that this * function doesn't care whether @memcg has use_hierarchy enabled and @@ -3930,7 +3376,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); /* try to free all pages in this cgroup */ - while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { + while (nr_retries && page_counter_read(&memcg->memory)) { int progress; if (signal_pending(current)) @@ -4001,8 +3447,8 @@ out: return retval; } -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static unsigned long tree_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; long val = 0; @@ -4020,55 +3466,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { u64 val; - if (!mem_cgroup_is_root(memcg)) { + if (mem_cgroup_is_root(memcg)) { + val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); + if (swap) + val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + } else { if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); + val = page_counter_read(&memcg->memory); else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); + val = page_counter_read(&memcg->memsw); } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - return val << PAGE_SHIFT; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, + RES_SOFT_LIMIT, +}; static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - enum res_type type = MEMFILE_TYPE(cft->private); - int name = MEMFILE_ATTR(cft->private); + struct page_counter *counter; - switch (type) { + switch (MEMFILE_TYPE(cft->private)) { case _MEM: - if (name == RES_USAGE) - return mem_cgroup_usage(memcg, false); - return res_counter_read_u64(&memcg->res, name); + counter = &memcg->memory; + break; case _MEMSWAP: - if (name == RES_USAGE) - return mem_cgroup_usage(memcg, true); - return res_counter_read_u64(&memcg->memsw, name); + counter = &memcg->memsw; + break; case _KMEM: - return res_counter_read_u64(&memcg->kmem, name); + counter = &memcg->kmem; break; default: BUG(); } + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + if (counter == &memcg->memory) + return mem_cgroup_usage(memcg, false); + if (counter == &memcg->memsw) + return mem_cgroup_usage(memcg, true); + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + case RES_SOFT_LIMIT: + return (u64)memcg->soft_limit * PAGE_SIZE; + default: + BUG(); + } } #ifdef CONFIG_MEMCG_KMEM -/* should be called with activate_kmem_mutex held */ -static int __memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long long limit) +static int memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long nr_pages) { int err = 0; int memcg_id; @@ -4115,7 +3577,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * We couldn't have accounted to this cgroup, because it hasn't got the * active bit set yet, so this should succeed. */ - err = res_counter_set_limit(&memcg->kmem, limit); + err = page_counter_limit(&memcg->kmem, nr_pages); VM_BUG_ON(err); static_key_slow_inc(&memcg_kmem_enabled_key); @@ -4130,26 +3592,17 @@ out: return err; } -static int memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long long limit) -{ - int ret; - - mutex_lock(&activate_kmem_mutex); - ret = __memcg_activate_kmem(memcg, limit); - mutex_unlock(&activate_kmem_mutex); - return ret; -} - static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { int ret; + mutex_lock(&memcg_limit_mutex); if (!memcg_kmem_is_active(memcg)) - ret = memcg_activate_kmem(memcg, val); + ret = memcg_activate_kmem(memcg, limit); else - ret = res_counter_set_limit(&memcg->kmem, val); + ret = page_counter_limit(&memcg->kmem, limit); + mutex_unlock(&memcg_limit_mutex); return ret; } @@ -4161,19 +3614,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) if (!parent) return 0; - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_limit_mutex); /* * If the parent cgroup is not kmem-active now, it cannot be activated * after this point, because it has at least one child already. */ if (memcg_kmem_is_active(parent)) - ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); - mutex_unlock(&activate_kmem_mutex); + ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); + mutex_unlock(&memcg_limit_mutex); return ret; } #else static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long long val) + unsigned long limit) { return -EINVAL; } @@ -4187,110 +3640,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - enum res_type type; - int name; - unsigned long long val; + unsigned long nr_pages; int ret; buf = strstrip(buf); - type = MEMFILE_TYPE(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + ret = page_counter_memparse(buf, &nr_pages); + if (ret) + return ret; - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ ret = -EINVAL; break; } - /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + ret = mem_cgroup_resize_limit(memcg, nr_pages); break; - if (type == _MEM) - ret = mem_cgroup_resize_limit(memcg, val); - else if (type == _MEMSWAP) - ret = mem_cgroup_resize_memsw_limit(memcg, val); - else if (type == _KMEM) - ret = memcg_update_kmem_limit(memcg, val); - else - return -EINVAL; - break; - case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) + case _MEMSWAP: + ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); break; - /* - * For memsw, soft limits are hard to implement in terms - * of semantics, for now, we support soft limits for - * control without swap - */ - if (type == _MEM) - ret = res_counter_set_soft_limit(&memcg->res, val); - else - ret = -EINVAL; + case _KMEM: + ret = memcg_update_kmem_limit(memcg, nr_pages); + break; + } break; - default: - ret = -EINVAL; /* should be BUG() ? */ + case RES_SOFT_LIMIT: + memcg->soft_limit = nr_pages; + ret = 0; break; } return ret ?: nbytes; } -static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, - unsigned long long *mem_limit, unsigned long long *memsw_limit) -{ - unsigned long long min_limit, min_memsw_limit, tmp; - - min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); - min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - if (!memcg->use_hierarchy) - goto out; - - while (memcg->css.parent) { - memcg = mem_cgroup_from_css(memcg->css.parent); - if (!memcg->use_hierarchy) - break; - tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); - min_limit = min(min_limit, tmp); - tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - min_memsw_limit = min(min_memsw_limit, tmp); - } -out: - *mem_limit = min_limit; - *memsw_limit = min_memsw_limit; -} - static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - int name; - enum res_type type; + struct page_counter *counter; - type = MEMFILE_TYPE(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + counter = &memcg->memory; + break; + case _MEMSWAP: + counter = &memcg->memsw; + break; + case _KMEM: + counter = &memcg->kmem; + break; + default: + BUG(); + } - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: - if (type == _MEM) - res_counter_reset_max(&memcg->res); - else if (type == _MEMSWAP) - res_counter_reset_max(&memcg->memsw); - else if (type == _KMEM) - res_counter_reset_max(&memcg->kmem); - else - return -EINVAL; + page_counter_reset_watermark(counter); break; case RES_FAILCNT: - if (type == _MEM) - res_counter_reset_failcnt(&memcg->res); - else if (type == _MEMSWAP) - res_counter_reset_failcnt(&memcg->memsw); - else if (type == _KMEM) - res_counter_reset_failcnt(&memcg->kmem); - else - return -EINVAL; + counter->failcnt = 0; break; + default: + BUG(); } return nbytes; @@ -4387,6 +3799,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) static int memcg_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; @@ -4406,14 +3819,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); /* Hierarchical information */ - { - unsigned long long limit, memsw_limit; - memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); - seq_printf(m, "hierarchical_memory_limit %llu\n", limit); - if (do_swap_account) - seq_printf(m, "hierarchical_memsw_limit %llu\n", - memsw_limit); + memory = memsw = PAGE_COUNTER_MAX; + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { + memory = min(memory, mi->memory.limit); + memsw = min(memsw, mi->memsw.limit); } + seq_printf(m, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); + if (do_swap_account) + seq_printf(m, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { long long val = 0; @@ -4497,7 +3912,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) { struct mem_cgroup_threshold_ary *t; - u64 usage; + unsigned long usage; int i; rcu_read_lock(); @@ -4596,10 +4011,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, { struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - u64 threshold, usage; + unsigned long threshold; + unsigned long usage; int i, size, ret; - ret = res_counter_memparse_write_strategy(args, &threshold); + ret = page_counter_memparse(args, &threshold); if (ret) return ret; @@ -4689,7 +4105,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, { struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - u64 usage; + unsigned long usage; int i, j, size; mutex_lock(&memcg->thresholds_lock); @@ -4855,40 +4271,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); } - -static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) -{ - if (!memcg_kmem_is_active(memcg)) - return; - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes. As we prevent from taking a reference for every - * such allocation we have to be careful when doing uncharge - * (see memcg_uncharge_kmem) and here during offlining. - * - * The idea is that that only the _last_ uncharge which sees - * the dead memcg will drop the last reference. An additional - * reference is taken here before the group is marked dead - * which is then paired with css_put during uncharge resp. here. - * - * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 and - * shouldn't be incremented anymore (css_tryget_online() would - * fail) we do not have other options because of the kmem - * allocations lifetime. - */ - css_get(&memcg->css); - - memcg_kmem_mark_dead(memcg); - - if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) - return; - - if (memcg_kmem_test_and_clear_dead(memcg)) - css_put(&memcg->css); -} #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { @@ -4898,10 +4280,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } - -static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) -{ -} #endif /* @@ -5064,7 +4442,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_dentry->d_name.name; + name = cfile.file->f_path.dentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -5088,7 +4466,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) @@ -5228,7 +4606,10 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_show = mem_cgroup_slabinfo_read, + .seq_start = slab_start, + .seq_next = slab_next, + .seq_stop = slab_stop, + .seq_show = memcg_slab_show, }, #endif #endif @@ -5363,9 +4744,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) */ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { - if (!memcg->res.parent) + if (!memcg->memory.parent) return NULL; - return mem_cgroup_from_res_counter(memcg->res.parent, res); + return mem_cgroup_from_counter(memcg->memory.parent, memory); } EXPORT_SYMBOL(parent_mem_cgroup); @@ -5410,9 +4791,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->memsw, NULL); + page_counter_init(&memcg->kmem, NULL); } memcg->last_scanned_node = MAX_NUMNODES; @@ -5451,18 +4832,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->swappiness = mem_cgroup_swappiness(parent); if (parent->use_hierarchy) { - res_counter_init(&memcg->res, &parent->res); - res_counter_init(&memcg->memsw, &parent->memsw); - res_counter_init(&memcg->kmem, &parent->kmem); + page_counter_init(&memcg->memory, &parent->memory); + page_counter_init(&memcg->memsw, &parent->memsw); + page_counter_init(&memcg->kmem, &parent->kmem); /* * No need to take a reference to the parent because cgroup * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->memsw, NULL); + page_counter_init(&memcg->kmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -5487,29 +4868,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } -/* - * Announce all parents that a group from their hierarchy is gone. - */ -static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = memcg; - - while ((parent = parent_mem_cgroup(parent))) - mem_cgroup_iter_invalidate(parent); - - /* - * if the root memcg is not hierarchical we have to check it - * explicitely. - */ - if (!root_mem_cgroup->use_hierarchy) - mem_cgroup_iter_invalidate(root_mem_cgroup); -} - static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; - struct cgroup_subsys_state *iter; /* * Unregister events and notify userspace. @@ -5523,17 +4885,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - kmem_cgroup_css_offline(memcg); - - mem_cgroup_invalidate_reclaim_iterators(memcg); - - /* - * This requires that offlining is serialized. Right now that is - * guaranteed because css_killed_work_fn() holds the cgroup_mutex. - */ - css_for_each_descendant_post(iter, css) - mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -5541,42 +4892,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - /* - * XXX: css_offline() would be where we should reparent all - * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget_online() and res_counter charging - * under the same RCU lock region, which means that charging - * could race with offlining. Offlining only happens to - * cgroups with no tasks in them but charges can show up - * without any tasks from the swapin path when the target - * memcg is looked up from the swapout record and not from the - * current task as it usually is. A race like this can leak - * charges and put pages with stale cgroup pointers into - * circulation: - * - * #0 #1 - * lookup_swap_cgroup_id() - * rcu_read_lock() - * mem_cgroup_lookup() - * css_tryget_online() - * rcu_read_unlock() - * disable css_tryget_online() - * call_rcu() - * offline_css() - * reparent_charges() - * res_counter_charge() - * css_put() - * css_free() - * pc->mem_cgroup = dead memcg - * add page to lru - * - * The bulk of the charges are still moved in offline_css() to - * avoid pinning a lot of pages in case a long-term reference - * like a swapout record is deferring the css_free() to long - * after offlining. But this makes sure we catch any charges - * made after offlining: - */ - mem_cgroup_reparent_charges(memcg); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -5599,10 +4914,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - mem_cgroup_resize_limit(memcg, ULLONG_MAX); - mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); - memcg_update_kmem_limit(memcg, ULLONG_MAX); - res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); + mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); + mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); + memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->soft_limit = 0; } #ifdef CONFIG_MMU @@ -5758,7 +5073,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; - struct page_cgroup *pc; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -5772,13 +5086,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (!page && !ent.val) return ret; if (page) { - pc = lookup_page_cgroup(page); /* * Do only loose check w/o serialization. - * mem_cgroup_move_account() checks the pc is valid or + * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) target->page = page; @@ -5806,15 +5119,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; - struct page_cgroup *pc; enum mc_target_type ret = MC_TARGET_NONE; page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!move_anon()) return ret; - pc = lookup_page_cgroup(page); - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -5897,7 +5208,6 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; - int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -5916,19 +5226,17 @@ static void __mem_cgroup_clear_mc(void) if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); - - for (i = 0; i < mc.moved_swap; i++) - css_put(&mc.from->css); + page_counter_uncharge(&mc.from->memsw, mc.moved_swap); /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. + * we charged both to->memory and to->memsw, so we + * should uncharge to->memory. */ if (!mem_cgroup_is_root(mc.to)) - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); + page_counter_uncharge(&mc.to->memory, mc.moved_swap); + + css_put_many(&mc.from->css, mc.moved_swap); + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@ -5939,8 +5247,6 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { - struct mem_cgroup *from = mc.from; - /* * we must clear moving_task before waking up waiters at the end of * task migration. @@ -5951,7 +5257,6 @@ static void mem_cgroup_clear_mc(void) mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mem_cgroup_end_move(from); } static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, @@ -5984,7 +5289,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - mem_cgroup_start_move(from); + spin_lock(&mc.lock); mc.from = from; mc.to = memcg; @@ -6004,7 +5309,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { - mem_cgroup_clear_mc(); + if (mc.to) + mem_cgroup_clear_mc(); } static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, @@ -6018,7 +5324,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct page *page; - struct page_cgroup *pc; /* * We don't take compound_lock() here but no race with splitting thp @@ -6039,9 +5344,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, - pc, mc.from, mc.to)) { + mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } @@ -6069,9 +5373,7 @@ retry: page = target.page; if (isolate_lru_page(page)) goto put; - pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(page, 1, pc, - mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -6115,6 +5417,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); + /* + * Signal mem_cgroup_begin_page_stat() to take the memcg's + * move_lock while we're moving its pages to another memcg. + * Then wait for already started RCU-only updates to finish. + */ + atomic_inc(&mc.from->moving_account); + synchronize_rcu(); retry: if (unlikely(!down_read_trylock(&mm->mmap_sem))) { /* @@ -6147,6 +5456,7 @@ retry: break; } up_read(&mm->mmap_sem); + atomic_dec(&mc.from->moving_account); } static void mem_cgroup_move_task(struct cgroup_subsys_state *css, @@ -6250,7 +5560,7 @@ static void __init enable_swap_cgroup(void) */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct page_cgroup *pc; + struct mem_cgroup *memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -6259,20 +5569,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!do_swap_account) return; - pc = lookup_page_cgroup(page); + memcg = page->mem_cgroup; /* Readahead page, never charged */ - if (!PageCgroupUsed(pc)) + if (!memcg) return; - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); - - oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + page->mem_cgroup = NULL; - pc->flags &= ~PCG_MEMSW; - css_get(&pc->mem_cgroup->css); - mem_cgroup_swap_statistics(pc->mem_cgroup, true); + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, 1); + + /* XXX: caller holds IRQ-safe mapping->tree_lock */ + VM_BUG_ON(!irqs_disabled()); + + mem_cgroup_charge_statistics(memcg, page, -1); + memcg_check_events(memcg, page); } /** @@ -6294,7 +5610,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) memcg = mem_cgroup_lookup(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + page_counter_uncharge(&memcg->memsw, 1); mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -6330,7 +5646,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, goto out; if (PageSwapCache(page)) { - struct page_cgroup *pc = lookup_page_cgroup(page); /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters @@ -6338,7 +5653,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * the page lock, which serializes swap cache removal, which * in turn serializes uncharging. */ - if (PageCgroupUsed(pc)) + if (page->mem_cgroup) goto out; } @@ -6452,19 +5767,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) } static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, - unsigned long nr_mem, unsigned long nr_memsw, unsigned long nr_anon, unsigned long nr_file, unsigned long nr_huge, struct page *dummy_page) { + unsigned long nr_pages = nr_anon + nr_file; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { - if (nr_mem) - res_counter_uncharge(&memcg->res, - nr_mem * PAGE_SIZE); - if (nr_memsw) - res_counter_uncharge(&memcg->memsw, - nr_memsw * PAGE_SIZE); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -6473,27 +5785,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); + + if (!mem_cgroup_is_root(memcg)) + css_put_many(&memcg->css, nr_pages); } static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; - unsigned long nr_memsw = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; unsigned long pgpgout = 0; - unsigned long nr_mem = 0; struct list_head *next; struct page *page; next = page_list->next; do { unsigned int nr_pages = 1; - struct page_cgroup *pc; page = list_entry(next, struct page, lru); next = page->lru.next; @@ -6501,24 +5813,22 @@ static void uncharge_list(struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) + if (!page->mem_cgroup) continue; /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point, we have - * fully exclusive access to the page. + * page->mem_cgroup at this point, we have fully + * exclusive access to the page. */ - if (memcg != pc->mem_cgroup) { + if (memcg != page->mem_cgroup) { if (memcg) { - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); - pgpgout = nr_mem = nr_memsw = 0; - nr_anon = nr_file = nr_huge = 0; + uncharge_batch(memcg, pgpgout, nr_anon, nr_file, + nr_huge, page); + pgpgout = nr_anon = nr_file = nr_huge = 0; } - memcg = pc->mem_cgroup; + memcg = page->mem_cgroup; } if (PageTransHuge(page)) { @@ -6532,18 +5842,14 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; - if (pc->flags & PCG_MEM) - nr_mem += nr_pages; - if (pc->flags & PCG_MEMSW) - nr_memsw += nr_pages; - pc->flags = 0; + page->mem_cgroup = NULL; pgpgout++; } while (next != page_list); if (memcg) - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); + uncharge_batch(memcg, pgpgout, nr_anon, nr_file, + nr_huge, page); } /** @@ -6555,14 +5861,11 @@ static void uncharge_list(struct list_head *page_list) */ void mem_cgroup_uncharge(struct page *page) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; /* Don't touch page->lru of any random page, pre-check: */ - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) + if (!page->mem_cgroup) return; INIT_LIST_HEAD(&page->lru); @@ -6598,7 +5901,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare) { - struct page_cgroup *pc; + struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); @@ -6613,27 +5916,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, return; /* Page cache replacement: new page already charged? */ - pc = lookup_page_cgroup(newpage); - if (PageCgroupUsed(pc)) + if (newpage->mem_cgroup) return; - /* Re-entrant migration: old page already uncharged? */ - pc = lookup_page_cgroup(oldpage); - if (!PageCgroupUsed(pc)) + /* + * Swapcache readahead pages can get migrated before being + * charged, and migration from compaction can happen to an + * uncharged page when the PFN walker finds a page that + * reclaim just put back on the LRU but has not released yet. + */ + memcg = oldpage->mem_cgroup; + if (!memcg) return; - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); - VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); - if (lrucare) lock_page_lru(oldpage, &isolated); - pc->flags = 0; + oldpage->mem_cgroup = NULL; if (lrucare) unlock_page_lru(oldpage, isolated); - commit_charge(newpage, pc->mem_cgroup, lrucare); + commit_charge(newpage, memcg, lrucare); } /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8639f6b28746..e5ee0ca7ae85 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,7 +233,7 @@ void shake_page(struct page *p, int access) lru_add_drain_all(); if (PageLRU(p)) return; - drain_all_pages(); + drain_all_pages(page_zone(p)); if (PageLRU(p) || is_free_buddy_page(p)) return; } @@ -860,7 +860,6 @@ static int page_action(struct page_state *ps, struct page *p, int count; result = ps->action(p, pfn); - action_result(pfn, ps->msg, result); count = page_count(p) - 1; if (ps->action == me_swapcache_dirty && result == DELAYED) @@ -871,6 +870,7 @@ static int page_action(struct page_state *ps, struct page *p, pfn, ps->msg, count); result = FAILED; } + action_result(pfn, ps->msg, result); /* Could do more checks here if page looks ok */ /* @@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) if (!is_free_buddy_page(page)) lru_add_drain_all(); if (!is_free_buddy_page(page)) - drain_all_pages(); + drain_all_pages(page_zone(page)); SetPageHWPoison(page); if (!is_free_buddy_page(page)) pr_info("soft offline: %#lx: page leaked\n", diff --git a/mm/memory.c b/mm/memory.c index 3e503831e042..0b3f6c71620d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -220,9 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long /* Is it from 0 to ~0? */ tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; - tlb->start = start; - tlb->end = end; - tlb->need_flush = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); @@ -232,15 +229,20 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif + + __tlb_reset_range(tlb); } static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { - tlb->need_flush = 0; + if (!tlb->end) + return; + tlb_flush(tlb); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif + __tlb_reset_range(tlb); } static void tlb_flush_mmu_free(struct mmu_gather *tlb) @@ -256,8 +258,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) void tlb_flush_mmu(struct mmu_gather *tlb) { - if (!tlb->need_flush) - return; tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } @@ -292,7 +292,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { struct mmu_gather_batch *batch; - VM_BUG_ON(!tlb->need_flush); + VM_BUG_ON(!tlb->end); batch = tlb->active; batch->pages[batch->nr++] = page; @@ -359,8 +359,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; - tlb->need_flush = 1; - /* * When there's less then two users of this mm there cannot be a * concurrent page-table walk. @@ -815,20 +813,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (likely(!non_swap_entry(entry))) + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } rss[MM_SWAPENTS]++; - else if (is_migration_entry(entry)) { + } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); if (PageAnon(page)) @@ -1186,20 +1184,8 @@ again: arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ - if (force_flush) { - unsigned long old_end; - - /* - * Flush the TLB just for the previous segment, - * then update the range to be the remaining - * TLB range. - */ - old_end = tlb->end; - tlb->end = addr; + if (force_flush) tlb_flush_mmu_tlbonly(tlb); - tlb->start = addr; - tlb->end = old_end; - } pte_unmap_unlock(start_pte, ptl); /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 252e1dbbed86..9fab10795bea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/tlbflush.h> @@ -1066,6 +1067,16 @@ out: } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +static void reset_node_present_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->present_pages = 0; + + pgdat->node_present_pages = 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { @@ -1096,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); + /* + * zone->managed_pages is set to an approximate value in + * free_area_init_core(), which will cause + * /sys/device/system/node/nodeX/meminfo has wrong data. + * So reset it to 0 before any memory is onlined. + */ + reset_node_managed_pages(pgdat); + + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages because they will be updated in + * online_pages() and offline_pages(). + */ + reset_node_present_pages(pgdat); + return pgdat; } @@ -1699,7 +1725,7 @@ repeat: if (drain) { lru_add_drain_all(); cond_resched(); - drain_all_pages(); + drain_all_pages(zone); } pfn = scan_movable_pages(start_pfn, end_pfn); @@ -1721,7 +1747,7 @@ repeat: lru_add_drain_all(); yield(); /* drain pcp pages, this is synchronous. */ - drain_all_pages(); + drain_all_pages(zone); /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. diff --git a/mm/mmap.c b/mm/mmap.c index 87e82b38453c..b6c0a77fc1c8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -776,8 +776,11 @@ again: remove_next = 1 + (end > next->vm_end); * shrinking vma had, to cover any anon pages imported. */ if (exporter && exporter->anon_vma && !importer->anon_vma) { - if (anon_vma_clone(importer, exporter)) - return -ENOMEM; + int error; + + error = anon_vma_clone(importer, exporter); + if (error) + return error; importer->anon_vma = exporter->anon_vma; } } @@ -2469,7 +2472,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (err) goto out_free_vma; - if (anon_vma_clone(new, vma)) + err = anon_vma_clone(new, vma); + if (err) goto out_free_mpol; if (new->vm_file) @@ -2597,6 +2601,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) detach_vmas_to_be_unmapped(mm, vma, prev, end); unmap_region(mm, vma, prev, start, end); + arch_unmap(mm, vma, start, end); + /* Fix up all other VM information */ remove_vma_list(mm, vma); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7c7ab32ee503..90b50468333e 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5340f6b91312..3b014d326151 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -119,7 +119,7 @@ found: /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *memcg, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, * swapents, oom_score_adj value, and name. */ -static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 19ceae87522d..d5d81f5384d1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, locked, memcg_flags); + mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); return ret; } @@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, locked, memcg_flags); + mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9cd36b822444..a7198c065999 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,7 +48,6 @@ #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> -#include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> #include <linux/compaction.h> @@ -467,29 +466,6 @@ static inline void rmv_page_order(struct page *page) } /* - * Locate the struct page for both the matching buddy in our - * pair (buddy1) and the combined O(n+1) page they form (page). - * - * 1) Any buddy B1 will have an order O twin B2 which satisfies - * the following equation: - * B2 = B1 ^ (1 << O) - * For example, if the starting buddy (buddy2) is #8 its order - * 1 buddy is #10: - * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 - * - * 2) Any buddy B will have an order O+1 parent P which - * satisfies the following equation: - * P = B & ~(1 << O) - * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER - */ -static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) -{ - return page_idx ^ (1 << order); -} - -/* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && @@ -569,6 +545,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; + int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); @@ -577,13 +554,24 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. Without this, pageblock isolation + * could cause incorrect freepage accounting. + */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - page_idx = pfn & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << max_order) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER-1) { + while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) @@ -594,9 +582,11 @@ static inline void __free_one_page(struct page *page, */ if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); - set_page_private(page, 0); - __mod_zone_freepage_state(zone, 1 << order, - migratetype); + set_page_private(buddy, 0); + if (!is_migrate_isolate(migratetype)) { + __mod_zone_freepage_state(zone, 1 << order, + migratetype); + } } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -650,8 +640,10 @@ static inline int free_pages_check(struct page *page) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; } - if (unlikely(mem_cgroup_bad_page_check(page))) - bad_reason = "cgroup check failed"; +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; @@ -715,14 +707,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(!is_migrate_isolate_page(page))) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } } while (--to_free && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -739,9 +729,11 @@ static void free_one_page(struct zone *zone, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -750,6 +742,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); @@ -907,8 +902,10 @@ static inline int check_new_page(struct page *page) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; } - if (unlikely(mem_cgroup_bad_page_check(page))) - bad_reason = "cgroup check failed"; +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; @@ -1276,55 +1273,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) #endif /* - * Drain pages of the indicated processor. + * Drain pcplists of the indicated processor and zone. * * The processor must either be the current processor and the * thread pinned to the current processor or a processor that * is not online. */ -static void drain_pages(unsigned int cpu) +static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct zone *zone; + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; - for_each_populated_zone(zone) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); - local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); + pcp = &pset->pcp; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } + local_irq_restore(flags); +} - pcp = &pset->pcp; - if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } - local_irq_restore(flags); +/* + * Drain pcplists of all zones on the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + drain_pages_zone(cpu, zone); } } /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. + * + * The CPU has to be pinned. When zone parameter is non-NULL, spill just + * the single zone's pages. */ -void drain_local_pages(void *arg) +void drain_local_pages(struct zone *zone) { - drain_pages(smp_processor_id()); + int cpu = smp_processor_id(); + + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * + * When zone parameter is non-NULL, spill just the single zone's pages. + * * Note that this code is protected against sending an IPI to an offline * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but * nothing keeps CPUs from showing up after we populated the cpumask and * before the call to on_each_cpu_mask(). */ -void drain_all_pages(void) +void drain_all_pages(struct zone *zone) { int cpu; - struct per_cpu_pageset *pcp; - struct zone *zone; /* * Allocate in the BSS so we wont require allocation in @@ -1339,20 +1356,31 @@ void drain_all_pages(void) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { + struct per_cpu_pageset *pcp; + struct zone *z; bool has_pcps = false; - for_each_populated_zone(zone) { + + if (zone) { pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) { + if (pcp->pcp.count) has_pcps = true; - break; + } else { + for_each_populated_zone(z) { + pcp = per_cpu_ptr(z->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } } } + if (has_pcps) cpumask_set_cpu(cpu, &cpus_with_pcps); else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); + on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, + zone, 1); } #ifdef CONFIG_HIBERNATION @@ -1484,7 +1512,7 @@ void split_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); -static int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order) { unsigned long watermark; struct zone *zone; @@ -1714,7 +1742,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages my go negative - that's OK */ + /* free_pages may go negative - that's OK */ long min = mark; int o; long free_cma = 0; @@ -2305,7 +2333,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int classzone_idx, int migratetype, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - struct zone *last_compact_zone = NULL; unsigned long compact_result; struct page *page; @@ -2316,7 +2343,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, contended_compaction, - &last_compact_zone); + alloc_flags, classzone_idx); current->flags &= ~PF_MEMALLOC; switch (compact_result) { @@ -2335,10 +2362,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); - page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2354,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } /* - * last_compact_zone is where try_to_compact_pages thought allocation - * should succeed, so it did not defer compaction. But here we know - * that it didn't succeed, so we do the defer. - */ - if (last_compact_zone && mode != MIGRATE_ASYNC) - defer_compaction(last_compact_zone, order); - - /* * It's bad if compaction run occurs and fails. The most likely reason * is that pages exist, but not enough to satisfy watermarks. */ @@ -2442,7 +2457,7 @@ retry: * pages are pinned on the per-cpu lists. Drain them and try again */ if (!page && !drained) { - drain_all_pages(); + drain_all_pages(NULL); drained = true; goto retry; } @@ -3902,14 +3917,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - printk("Built %i zonelists in %s order, mobility grouping %s. " + pr_info("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA - printk("Policy zone: %s\n", zone_names[policy_zone]); + pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif } @@ -4841,7 +4856,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); - pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -5343,33 +5357,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ - printk("Zone ranges:\n"); + pr_info("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(KERN_CONT " %-8s ", zone_names[i]); + pr_info(" %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) - printk(KERN_CONT "empty\n"); + pr_cont("empty\n"); else - printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", + pr_cont("[mem %0#10lx-%0#10lx]\n", arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, (arch_zone_highest_possible_pfn[i] << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - printk("Movable zone start for each node\n"); + pr_info("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - printk(" Node %d: %#010lx\n", i, + pr_info(" Node %d: %#010lx\n", i, zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early node map */ - printk("Early memory node ranges\n"); + pr_info("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, + pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ @@ -5505,7 +5519,7 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - printk("Memory: %luK/%luK available " + pr_info("Memory: %luK/%luK available " "(%luK kernel code, %luK rwdata, %luK rodata, " "%luK init, %luK bss, %luK reserved" #ifdef CONFIG_HIGHMEM @@ -6394,7 +6408,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ lru_add_drain_all(); - drain_all_pages(); + drain_all_pages(cc.zone); order = 0; outer_start = start; @@ -6408,13 +6422,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", - outer_start, end); + pr_info("%s: [%lx, %lx) PFNs busy\n", + __func__, outer_start, end); ret = -EBUSY; goto done; } - /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c deleted file mode 100644 index 5331c2bd85a2..000000000000 --- a/mm/page_cgroup.c +++ /dev/null @@ -1,530 +0,0 @@ -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/bootmem.h> -#include <linux/bit_spinlock.h> -#include <linux/page_cgroup.h> -#include <linux/hash.h> -#include <linux/slab.h> -#include <linux/memory.h> -#include <linux/vmalloc.h> -#include <linux/cgroup.h> -#include <linux/swapops.h> -#include <linux/kmemleak.h> - -static unsigned long total_usage; - -#if !defined(CONFIG_SPARSEMEM) - - -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ - pgdat->node_page_cgroup = NULL; -} - -struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - unsigned long offset; - struct page_cgroup *base; - - base = NODE_DATA(page_to_nid(page))->node_page_cgroup; -#ifdef CONFIG_DEBUG_VM - /* - * The sanity checks the page allocator does upon freeing a - * page can reach here before the page_cgroup arrays are - * allocated when feeding a range of pages to the allocator - * for the first time during bootup or memory hotplug. - */ - if (unlikely(!base)) - return NULL; -#endif - offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; - return base + offset; -} - -static int __init alloc_node_page_cgroup(int nid) -{ - struct page_cgroup *base; - unsigned long table_size; - unsigned long nr_pages; - - nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) - return 0; - - table_size = sizeof(struct page_cgroup) * nr_pages; - - base = memblock_virt_alloc_try_nid_nopanic( - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); - if (!base) - return -ENOMEM; - NODE_DATA(nid)->node_page_cgroup = base; - total_usage += table_size; - return 0; -} - -void __init page_cgroup_init_flatmem(void) -{ - - int nid, fail; - - if (mem_cgroup_disabled()) - return; - - for_each_online_node(nid) { - fail = alloc_node_page_cgroup(nid); - if (fail) - goto fail; - } - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" - " don't want memory cgroups\n"); - return; -fail: - printk(KERN_CRIT "allocation of page_cgroup failed.\n"); - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); -} - -#else /* CONFIG_FLAT_NODE_MEM_MAP */ - -struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM - /* - * The sanity checks the page allocator does upon freeing a - * page can reach here before the page_cgroup arrays are - * allocated when feeding a range of pages to the allocator - * for the first time during bootup or memory hotplug. - */ - if (!section->page_cgroup) - return NULL; -#endif - return section->page_cgroup + pfn; -} - -static void *__meminit alloc_page_cgroup(size_t size, int nid) -{ - gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; - void *addr = NULL; - - addr = alloc_pages_exact_nid(nid, size, flags); - if (addr) { - kmemleak_alloc(addr, size, 1, flags); - return addr; - } - - if (node_state(nid, N_HIGH_MEMORY)) - addr = vzalloc_node(size, nid); - else - addr = vzalloc(size); - - return addr; -} - -static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) -{ - struct mem_section *section; - struct page_cgroup *base; - unsigned long table_size; - - section = __pfn_to_section(pfn); - - if (section->page_cgroup) - return 0; - - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - base = alloc_page_cgroup(table_size, nid); - - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); - - if (!base) { - printk(KERN_ERR "page cgroup allocation failure\n"); - return -ENOMEM; - } - - /* - * The passed "pfn" may not be aligned to SECTION. For the calculation - * we need to apply a mask. - */ - pfn &= PAGE_SECTION_MASK; - section->page_cgroup = base - pfn; - total_usage += table_size; - return 0; -} -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - kmemleak_free(addr); - free_pages_exact(addr, table_size); - } -} - -static void __free_page_cgroup(unsigned long pfn) -{ - struct mem_section *ms; - struct page_cgroup *base; - - ms = __pfn_to_section(pfn); - if (!ms || !ms->page_cgroup) - return; - base = ms->page_cgroup + pfn; - free_page_cgroup(base); - ms->page_cgroup = NULL; -} - -static int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) -{ - unsigned long start, end, pfn; - int fail = 0; - - start = SECTION_ALIGN_DOWN(start_pfn); - end = SECTION_ALIGN_UP(start_pfn + nr_pages); - - if (nid == -1) { - /* - * In this case, "nid" already exists and contains valid memory. - * "start_pfn" passed to us is a pfn which is an arg for - * online__pages(), and start_pfn should exist. - */ - nid = pfn_to_nid(start_pfn); - VM_BUG_ON(!node_state(nid, N_ONLINE)); - } - - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { - if (!pfn_present(pfn)) - continue; - fail = init_section_page_cgroup(pfn, nid); - } - if (!fail) - return 0; - - /* rollback */ - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) - __free_page_cgroup(pfn); - - return -ENOMEM; -} - -static int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) -{ - unsigned long start, end, pfn; - - start = SECTION_ALIGN_DOWN(start_pfn); - end = SECTION_ALIGN_UP(start_pfn + nr_pages); - - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) - __free_page_cgroup(pfn); - return 0; - -} - -static int __meminit page_cgroup_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mn = arg; - int ret = 0; - switch (action) { - case MEM_GOING_ONLINE: - ret = online_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_OFFLINE: - offline_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_CANCEL_ONLINE: - offline_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_GOING_OFFLINE: - break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: - break; - } - - return notifier_from_errno(ret); -} - -#endif - -void __init page_cgroup_init(void) -{ - unsigned long pfn; - int nid; - - if (mem_cgroup_disabled()) - return; - - for_each_node_state(nid, N_MEMORY) { - unsigned long start_pfn, end_pfn; - - start_pfn = node_start_pfn(nid); - end_pfn = node_end_pfn(nid); - /* - * start_pfn and end_pfn may not be aligned to SECTION and the - * page->flags of out of node pages are not initialized. So we - * scan [start_pfn, the biggest section's pfn < end_pfn) here. - */ - for (pfn = start_pfn; - pfn < end_pfn; - pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { - - if (!pfn_valid(pfn)) - continue; - /* - * Nodes's pfns can be overlapping. - * We know some arch can have a nodes layout such as - * -------------pfn--------------> - * N0 | N1 | N2 | N0 | N1 | N2|.... - */ - if (pfn_to_nid(pfn) != nid) - continue; - if (init_section_page_cgroup(pfn, nid)) - goto oom; - } - } - hotplug_memory_notifier(page_cgroup_callback, 0); - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " - "don't want memory cgroups\n"); - return; -oom: - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); -} - -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ - return; -} - -#endif - - -#ifdef CONFIG_MEMCG_SWAP - -static DEFINE_MUTEX(swap_cgroup_mutex); -struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; - -struct swap_cgroup { - unsigned short id; -}; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. - */ -static int swap_cgroup_prepare(int type) -{ - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; - - ctrl = &swap_cgroup_ctrl[type]; - - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); - - return -ENOMEM; -} - -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) -{ - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; -} - -/** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id - * - * Returns old id at success, 0 at failure. - * (There is no mem_cgroup using 0 as its id) - */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) -{ - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; -} - -/** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into - * @id: mem_cgroup to be recorded - * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) - */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) -{ - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - sc->id = id; - spin_unlock_irqrestore(&ctrl->lock, flags); - - return old; -} - -/** - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry - * @ent: swap entry to be looked up. - * - * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) - */ -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return lookup_swap_cgroup(ent, NULL)->id; -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - void *array; - unsigned long array_size; - unsigned long length; - struct swap_cgroup_ctrl *ctrl; - - if (!do_swap_account) - return 0; - - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); - array_size = length * sizeof(void *); - - array = vzalloc(array_size); - if (!array) - goto nomem; - - ctrl = &swap_cgroup_ctrl[type]; - mutex_lock(&swap_cgroup_mutex); - ctrl->length = length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } - mutex_unlock(&swap_cgroup_mutex); - - return 0; -nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); - return -ENOMEM; -} - -void swap_cgroup_swapoff(int type) -{ - struct page **map; - unsigned long i, length; - struct swap_cgroup_ctrl *ctrl; - - if (!do_swap_account) - return; - - mutex_lock(&swap_cgroup_mutex); - ctrl = &swap_cgroup_ctrl[type]; - map = ctrl->map; - length = ctrl->length; - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - } - vfree(map); - } -} - -#endif diff --git a/mm/page_counter.c b/mm/page_counter.c new file mode 100644 index 000000000000..a009574fbba9 --- /dev/null +++ b/mm/page_counter.c @@ -0,0 +1,192 @@ +/* + * Lockless hierarchical page accounting & limiting + * + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner + */ + +#include <linux/page_counter.h> +#include <linux/atomic.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/bug.h> +#include <asm/page.h> + +/** + * page_counter_cancel - take pages out of the local counter + * @counter: counter + * @nr_pages: number of pages to cancel + */ +void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) +{ + long new; + + new = atomic_long_sub_return(nr_pages, &counter->count); + /* More uncharges than charges? */ + WARN_ON_ONCE(new < 0); +} + +/** + * page_counter_charge - hierarchically charge pages + * @counter: counter + * @nr_pages: number of pages to charge + * + * NOTE: This does not consider any configured counter limits. + */ +void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + for (c = counter; c; c = c->parent) { + long new; + + new = atomic_long_add_return(nr_pages, &c->count); + /* + * This is indeed racy, but we can live with some + * inaccuracy in the watermark. + */ + if (new > c->watermark) + c->watermark = new; + } +} + +/** + * page_counter_try_charge - try to hierarchically charge pages + * @counter: counter + * @nr_pages: number of pages to charge + * @fail: points first counter to hit its limit, if any + * + * Returns 0 on success, or -ENOMEM and @fail if the counter or one of + * its ancestors has hit its configured limit. + */ +int page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail) +{ + struct page_counter *c; + + for (c = counter; c; c = c->parent) { + long new; + /* + * Charge speculatively to avoid an expensive CAS. If + * a bigger charge fails, it might falsely lock out a + * racing smaller charge and send it into reclaim + * early, but the error is limited to the difference + * between the two sizes, which is less than 2M/4M in + * case of a THP locking out a regular page charge. + * + * The atomic_long_add_return() implies a full memory + * barrier between incrementing the count and reading + * the limit. When racing with page_counter_limit(), + * we either see the new limit or the setter sees the + * counter has changed and retries. + */ + new = atomic_long_add_return(nr_pages, &c->count); + if (new > c->limit) { + atomic_long_sub(nr_pages, &c->count); + /* + * This is racy, but we can live with some + * inaccuracy in the failcnt. + */ + c->failcnt++; + *fail = c; + goto failed; + } + /* + * Just like with failcnt, we can live with some + * inaccuracy in the watermark. + */ + if (new > c->watermark) + c->watermark = new; + } + return 0; + +failed: + for (c = counter; c != *fail; c = c->parent) + page_counter_cancel(c, nr_pages); + + return -ENOMEM; +} + +/** + * page_counter_uncharge - hierarchically uncharge pages + * @counter: counter + * @nr_pages: number of pages to uncharge + */ +void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + for (c = counter; c; c = c->parent) + page_counter_cancel(c, nr_pages); +} + +/** + * page_counter_limit - limit the number of pages allowed + * @counter: counter + * @limit: limit to set + * + * Returns 0 on success, -EBUSY if the current number of pages on the + * counter already exceeds the specified limit. + * + * The caller must serialize invocations on the same counter. + */ +int page_counter_limit(struct page_counter *counter, unsigned long limit) +{ + for (;;) { + unsigned long old; + long count; + + /* + * Update the limit while making sure that it's not + * below the concurrently-changing counter value. + * + * The xchg implies two full memory barriers before + * and after, so the read-swap-read is ordered and + * ensures coherency with page_counter_try_charge(): + * that function modifies the count before checking + * the limit, so if it sees the old limit, we see the + * modified counter and retry. + */ + count = atomic_long_read(&counter->count); + + if (count > limit) + return -EBUSY; + + old = xchg(&counter->limit, limit); + + if (atomic_long_read(&counter->count) <= count) + return 0; + + counter->limit = old; + cond_resched(); + } +} + +/** + * page_counter_memparse - memparse() for page counter limits + * @buf: string to parse + * @nr_pages: returns the result in number of pages + * + * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be + * limited to %PAGE_COUNTER_MAX. + */ +int page_counter_memparse(const char *buf, unsigned long *nr_pages) +{ + char unlimited[] = "-1"; + char *end; + u64 bytes; + + if (!strncmp(buf, unlimited, sizeof(unlimited))) { + *nr_pages = PAGE_COUNTER_MAX; + return 0; + } + + bytes = memparse(buf, &end); + if (*end != '\0') + return -EINVAL; + + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); + + return 0; +} diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d1473b2e9481..72f5ac381ab3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -60,6 +60,7 @@ out: int migratetype = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -67,7 +68,7 @@ out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) - drain_all_pages(); + drain_all_pages(zone); return ret; } @@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; + struct page *isolated_page = NULL; + unsigned int order; + unsigned long page_idx, buddy_idx; + struct page *buddy; zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_freepage_state(zone, nr_pages, migratetype); + + /* + * Because freepage with more than pageblock_order on isolated + * pageblock is restricted to merge due to freepage counting problem, + * it is possible that there is free buddy page. + * move_freepages_block() doesn't care of merge so we need other + * approach in order to merge them. Isolation and free will make + * these pages to be merged. + */ + if (PageBuddy(page)) { + order = page_order(page); + if (order >= pageblock_order) { + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + + if (!is_migrate_isolate_page(buddy)) { + __isolate_free_page(page, order); + set_page_refcounted(page); + isolated_page = page; + } + } + } + + /* + * If we isolate freepage with more than pageblock_order, there + * should be no freepage in the range, so we could avoid costly + * pageblock scanning for freepage moving. + */ + if (!isolated_page) { + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); + if (isolated_page) + __free_pages(isolated_page, order); } static inline struct page * diff --git a/mm/rmap.c b/mm/rmap.c index 19886fb2f13a..45eba36fd673 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -274,6 +274,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; + int error; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) @@ -283,8 +284,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - if (anon_vma_clone(vma, pvma)) - return -ENOMEM; + error = anon_vma_clone(vma, pvma); + if (error) + return error; /* Then add our own anon_vma. */ anon_vma = anon_vma_alloc(); @@ -1051,7 +1053,7 @@ void page_add_file_rmap(struct page *page) __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, locked, flags); + mem_cgroup_end_page_stat(memcg, &locked, &flags); } static void page_remove_file_rmap(struct page *page) @@ -1081,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, locked, flags); + mem_cgroup_end_page_stat(memcg, &locked, &flags); } /** diff --git a/mm/slab.c b/mm/slab.c index eb2b2ea30130..79e15f0a2a6e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & GFP_SLAB_BUG_MASK); + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); /* Take the node list lock to change the colour_next on this node */ @@ -3076,7 +3079,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, void *obj; int x; - VM_BUG_ON(nodeid > num_online_nodes()); + VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); @@ -3580,11 +3583,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) for_each_online_node(node) { - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } new_shared = NULL; if (cachep->shared) { @@ -4043,12 +4046,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, #ifdef CONFIG_DEBUG_SLAB_LEAK -static void *leaks_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&slab_mutex); - return seq_list_start(&slab_caches, *pos); -} - static inline int add_caller(unsigned long *n, unsigned long v) { unsigned long *p; @@ -4170,7 +4167,7 @@ static int leaks_show(struct seq_file *m, void *p) } static const struct seq_operations slabstats_op = { - .start = leaks_start, + .start = slab_start, .next = slab_next, .stop = slab_stop, .show = leaks_show, diff --git a/mm/slab.h b/mm/slab.h index ab019e63e3c2..1cf4005482dd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) rcu_read_lock(); params = rcu_dereference(s->memcg_params); - cachep = params->memcg_caches[idx]; - rcu_read_unlock(); /* * Make sure we will access the up-to-date value. The code updating * memcg_caches issues a write barrier to match this (see * memcg_register_cache()). */ - smp_read_barrier_depends(); + cachep = lockless_dereference(params->memcg_caches[idx]); + rcu_read_unlock(); + return cachep; } @@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif +void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 406944207b61..e03dd6f2a272 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, size = ALIGN(size, align); flags = kmem_cache_flags(size, flags, name, NULL); - list_for_each_entry(s, &slab_caches, list) { + list_for_each_entry_reverse(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; @@ -259,6 +259,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, if (s->size - size >= sizeof(void *)) continue; + if (IS_ENABLED(CONFIG_SLAB) && align && + (align > s->align || s->align % align)) + continue; + return s; } return NULL; @@ -807,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); #define SLABINFO_RIGHTS S_IRUSR #endif -void print_slabinfo_header(struct seq_file *m) +static void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -830,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -static void *s_start(struct seq_file *m, loff_t *pos) +void *slab_start(struct seq_file *m, loff_t *pos) { - loff_t n = *pos; - mutex_lock(&slab_mutex); - if (!n) - print_slabinfo_header(m); - return seq_list_start(&slab_caches, *pos); } @@ -877,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) } } -int cache_show(struct kmem_cache *s, struct seq_file *m) +static void cache_show(struct kmem_cache *s, struct seq_file *m) { struct slabinfo sinfo; @@ -896,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); slabinfo_show_stats(m, s); seq_putc(m, '\n'); +} + +static int slab_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (p == slab_caches.next) + print_slabinfo_header(m); + if (is_root_cache(s)) + cache_show(s, m); return 0; } -static int s_show(struct seq_file *m, void *p) +#ifdef CONFIG_MEMCG_KMEM +int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (!is_root_cache(s)) - return 0; - return cache_show(s, m); + if (p == slab_caches.next) + print_slabinfo_header(m); + if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + cache_show(s, m); + return 0; } +#endif /* * slabinfo_op - iterator that generates /proc/slabinfo @@ -922,10 +936,10 @@ static int s_show(struct seq_file *m, void *p) * + further values on SMP and with statistics enabled */ static const struct seq_operations slabinfo_op = { - .start = s_start, + .start = slab_start, .next = slab_next, .stop = slab_stop, - .show = s_show, + .show = slab_show, }; static int slabinfo_open(struct inode *inode, struct file *file) diff --git a/mm/slub.c b/mm/slub.c index ae7b9f1ad394..386bbed76e94 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page) maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", - s->name, page->objects, maxobj); + page->objects, maxobj); return 0; } if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, page->objects); + page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) int nr = 0; void *fp; void *object = NULL; - unsigned long max_objects; + int max_objects; fp = page->freelist; while (fp && nr <= page->objects) { @@ -1377,7 +1377,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) int order; int idx; - BUG_ON(flags & GFP_SLAB_BUG_MASK); + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } page = allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); @@ -2554,7 +2557,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } else { /* Needs to be taken off a list */ - n = get_node(s, page_to_nid(page)); + n = get_node(s, page_to_nid(page)); /* * Speculatively acquire the list_lock. * If the cmpxchg does not succeed then we may @@ -2587,10 +2590,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * The list lock was not taken therefore no list * activity can be necessary. */ - if (was_frozen) - stat(s, FREE_FROZEN); - return; - } + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c new file mode 100644 index 000000000000..b5f7f24b8dd1 --- /dev/null +++ b/mm/swap_cgroup.c @@ -0,0 +1,208 @@ +#include <linux/swap_cgroup.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> + +#include <linux/swapops.h> /* depends on mm.h include */ + +static DEFINE_MUTEX(swap_cgroup_mutex); +struct swap_cgroup_ctrl { + struct page **map; + unsigned long length; + spinlock_t lock; +}; + +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; + +struct swap_cgroup { + unsigned short id; +}; +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) + +/* + * SwapCgroup implements "lookup" and "exchange" operations. + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge + * against SwapCache. At swap_free(), this is accessed directly from swap. + * + * This means, + * - we have no race in "exchange" when we're accessed via SwapCache because + * SwapCache(and its swp_entry) is under lock. + * - When called via swap_free(), there is no user of this entry and no race. + * Then, we don't need lock around "exchange". + * + * TODO: we can push these buffers out to HIGHMEM. + */ + +/* + * allocate buffer for swap_cgroup. + */ +static int swap_cgroup_prepare(int type) +{ + struct page *page; + struct swap_cgroup_ctrl *ctrl; + unsigned long idx, max; + + ctrl = &swap_cgroup_ctrl[type]; + + for (idx = 0; idx < ctrl->length; idx++) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto not_enough_page; + ctrl->map[idx] = page; + } + return 0; +not_enough_page: + max = idx; + for (idx = 0; idx < max; idx++) + __free_page(ctrl->map[idx]); + + return -ENOMEM; +} + +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + +/** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @ent: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup using 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** + * swap_cgroup_record - record mem_cgroup for this swp_entry. + * @ent: swap entry to be recorded into + * @id: mem_cgroup to be recorded + * + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) + */ +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned short old; + unsigned long flags; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + old = sc->id; + sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); + + return old; +} + +/** + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry + * @ent: swap entry to be looked up. + * + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + */ +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) +{ + return lookup_swap_cgroup(ent, NULL)->id; +} + +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + void *array; + unsigned long array_size; + unsigned long length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return 0; + + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + array_size = length * sizeof(void *); + + array = vzalloc(array_size); + if (!array) + goto nomem; + + ctrl = &swap_cgroup_ctrl[type]; + mutex_lock(&swap_cgroup_mutex); + ctrl->length = length; + ctrl->map = array; + spin_lock_init(&ctrl->lock); + if (swap_cgroup_prepare(type)) { + /* memory shortage */ + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + vfree(array); + goto nomem; + } + mutex_unlock(&swap_cgroup_mutex); + + return 0; +nomem: + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); + printk(KERN_INFO + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + return -ENOMEM; +} + +void swap_cgroup_swapoff(int type) +{ + struct page **map; + unsigned long i, length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return; + + mutex_lock(&swap_cgroup_mutex); + ctrl = &swap_cgroup_ctrl[type]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; i++) { + struct page *page = map[i]; + if (page) + __free_page(page); + } + vfree(map); + } +} diff --git a/mm/swap_state.c b/mm/swap_state.c index 154444918685..9711342987a0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,7 +17,6 @@ #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/migrate.h> -#include <linux/page_cgroup.h> #include <asm/pgtable.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 8798b2e0ac59..63f55ccb9b26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,7 +38,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <linux/swapops.h> -#include <linux/page_cgroup.h> +#include <linux/swap_cgroup.h> static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); diff --git a/mm/truncate.c b/mm/truncate.c index 261eaf6e5a19..f1e4d6052369 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache); * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and before all filesystem specific - * block truncation has been performed. + * Must be called with a lock serializing truncates and writes (generally + * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { @@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) struct page *page; pgoff_t index; - WARN_ON(!mutex_is_locked(&inode->i_mutex)); WARN_ON(to > inode->i_size); if (from >= to || bsize == PAGE_CACHE_SIZE) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 90520af7f186..8a18196fcdff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -463,8 +463,7 @@ overflow: goto retry; } if (printk_ratelimit()) - printk(KERN_WARNING - "vmap allocation for size %lu failed: " + pr_warn("vmap allocation for size %lu failed: " "use vmalloc=<size> to increase size.\n", size); kfree(va); return ERR_PTR(-EBUSY); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d4042e75f7c7..c5afd573d7da 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -165,6 +165,7 @@ static void vmpressure_work_fn(struct work_struct *work) unsigned long scanned; unsigned long reclaimed; + spin_lock(&vmpr->sr_lock); /* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old @@ -173,11 +174,12 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - if (!vmpr->scanned) + scanned = vmpr->scanned; + if (!scanned) { + spin_unlock(&vmpr->sr_lock); return; + } - spin_lock(&vmpr->sr_lock); - scanned = vmpr->scanned; reclaimed = vmpr->reclaimed; vmpr->scanned = 0; vmpr->reclaimed = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index dcb47074ae03..4636d9e822c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -260,8 +260,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { - printk(KERN_ERR - "shrink_slab: %pF negative objects to delete nr=%ld\n", + pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; } @@ -875,7 +874,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * end of the LRU a second time. */ mapping = page_mapping(page); - if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + if (((dirty || writeback) && mapping && + bdi_write_congested(mapping->backing_dev_info)) || (writeback && PageReclaim(page))) nr_congested++; @@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone, return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(zone, sc->order, 0, 0)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order) * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. */ - if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order, balance_gap, classzone_idx, 0)) return false; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, + order, 0, classzone_idx) == COMPACT_SKIPPED) return false; return true; @@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * from memory. Do not reclaim more than needed for compaction. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order) != - COMPACT_SKIPPED) + compaction_suitable(zone, sc->order, 0, classzone_idx) + != COMPACT_SKIPPED) testorder = 0; /* |