diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 18 | ||||
-rw-r--r-- | mm/backing-dev.c | 634 | ||||
-rw-r--r-- | mm/bootmem.c | 13 | ||||
-rw-r--r-- | mm/debug.c | 2 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 34 | ||||
-rw-r--r-- | mm/hugetlb.c | 10 | ||||
-rw-r--r-- | mm/internal.h | 11 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 1 | ||||
-rw-r--r-- | mm/madvise.c | 1 | ||||
-rw-r--r-- | mm/memblock.c | 34 | ||||
-rw-r--r-- | mm/memcontrol.c | 223 | ||||
-rw-r--r-- | mm/mm_init.c | 9 | ||||
-rw-r--r-- | mm/nobootmem.c | 7 | ||||
-rw-r--r-- | mm/nommu.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 1231 | ||||
-rw-r--r-- | mm/page_alloc.c | 442 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/page_owner.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 2 | ||||
-rw-r--r-- | mm/rmap.c | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 32 | ||||
-rw-r--r-- | mm/truncate.c | 18 | ||||
-rw-r--r-- | mm/vmscan.c | 79 | ||||
-rw-r--r-- | mm/zbud.c | 23 | ||||
-rw-r--r-- | mm/zpool.c | 35 | ||||
-rw-r--r-- | mm/zsmalloc.c | 7 | ||||
-rw-r--r-- | mm/zswap.c | 12 |
28 files changed, 2083 insertions, 807 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c180af880ed5..e79de2bd12cd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -636,3 +636,21 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. + +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the systemm until kswapd + finishes the initialisation. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 000e7b3b9896..7756da31b02b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; @@ -48,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; + unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; @@ -66,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + wb_thresh = wb_calc_thresh(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", - (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), + (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), + (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), + K(wb_thresh), K(dirty_thresh), K(background_thresh), - (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), - (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), - (unsigned long) K(bdi->write_bandwidth), + (unsigned long) K(wb_stat(wb, WB_DIRTIED)), + (unsigned long) K(wb_stat(wb, WB_WRITTEN)), + (unsigned long) K(wb->write_bandwidth), nr_dirty, nr_io, nr_more_io, nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->state); + !list_empty(&bdi->bdi_list), bdi->wb.state); #undef K return 0; @@ -255,13 +256,8 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -int bdi_has_dirty_io(struct backing_dev_info *bdi) -{ - return wb_has_dirty_io(&bdi->wb); -} - /* - * This function is used when the first inode for this bdi is marked dirty. It + * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just @@ -274,162 +270,550 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). */ -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_bh(&wb->work_lock); } /* - * Remove bdi from bdi_list, and ensure that it is no longer visible + * Initial write bandwidth: 100 MB/s */ -static void bdi_remove_from_list(struct backing_dev_info *bdi) -{ - spin_lock_bh(&bdi_lock); - list_del_rcu(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); - - synchronize_rcu_expedited(); -} +#define INIT_BW (100 << (20 - PAGE_SHIFT)) -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) +static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, + gfp_t gfp) { - va_list args; - struct device *dev; + int i, err; - if (bdi->dev) /* The driver needs to use separate queues per device */ - return 0; + memset(wb, 0, sizeof(*wb)); - va_start(args, fmt); - dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); - va_end(args); - if (IS_ERR(dev)) - return PTR_ERR(dev); + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); + spin_lock_init(&wb->list_lock); - bdi->dev = dev; + wb->bw_time_stamp = jiffies; + wb->balanced_dirty_ratelimit = INIT_BW; + wb->dirty_ratelimit = INIT_BW; + wb->write_bandwidth = INIT_BW; + wb->avg_write_bandwidth = INIT_BW; - bdi_debug_register(bdi, dev_name(dev)); - set_bit(BDI_registered, &bdi->state); + spin_lock_init(&wb->work_lock); + INIT_LIST_HEAD(&wb->work_list); + INIT_DELAYED_WORK(&wb->dwork, wb_workfn); - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); + err = fprop_local_init_percpu(&wb->completions, gfp); + if (err) + return err; - trace_writeback_bdi_register(bdi); - return 0; -} -EXPORT_SYMBOL(bdi_register); + for (i = 0; i < NR_WB_STAT_ITEMS; i++) { + err = percpu_counter_init(&wb->stat[i], 0, gfp); + if (err) { + while (--i) + percpu_counter_destroy(&wb->stat[i]); + fprop_local_destroy_percpu(&wb->completions); + return err; + } + } -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) -{ - return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); + return 0; } -EXPORT_SYMBOL(bdi_register_dev); /* * Remove bdi from the global list and shutdown any threads we have running */ -static void bdi_wb_shutdown(struct backing_dev_info *bdi) +static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ - spin_lock_bh(&bdi->wb_lock); - if (!test_and_clear_bit(BDI_registered, &bdi->state)) { - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (!test_and_clear_bit(WB_registered, &wb->state)) { + spin_unlock_bh(&wb->work_lock); return; } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); /* - * Make sure nobody finds us on the bdi_list anymore + * Drain work list and shutdown the delayed_work. !WB_registered + * tells wb_workfn() that @wb is dying and its work_list needs to + * be drained no matter what. */ - bdi_remove_from_list(bdi); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + flush_delayed_work(&wb->dwork); + WARN_ON(!list_empty(&wb->work_list)); +} + +static void wb_exit(struct bdi_writeback *wb) +{ + int i; + + WARN_ON(delayed_work_pending(&wb->dwork)); + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) + percpu_counter_destroy(&wb->stat[i]); + + fprop_local_destroy_percpu(&wb->completions); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +#include <linux/memcontrol.h> + +/* + * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, + * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU + * protected. cgwb_release_wait is used to wait for the completion of cgwb + * releases from bdi destruction path. + */ +static DEFINE_SPINLOCK(cgwb_lock); +static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); + +/** + * wb_congested_get_create - get or create a wb_congested + * @bdi: associated bdi + * @blkcg_id: ID of the associated blkcg + * @gfp: allocation mask + * + * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. + * The returned wb_congested has its reference count incremented. Returns + * NULL on failure. + */ +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + struct bdi_writeback_congested *new_congested = NULL, *congested; + struct rb_node **node, *parent; + unsigned long flags; + + if (blkcg_id == 1) + return &bdi->wb_congested; +retry: + spin_lock_irqsave(&cgwb_lock, flags); + + node = &bdi->cgwb_congested_tree.rb_node; + parent = NULL; + + while (*node != NULL) { + parent = *node; + congested = container_of(parent, struct bdi_writeback_congested, + rb_node); + if (congested->blkcg_id < blkcg_id) + node = &parent->rb_left; + else if (congested->blkcg_id > blkcg_id) + node = &parent->rb_right; + else + goto found; + } + + if (new_congested) { + /* !found and storage for new one already allocated, insert */ + congested = new_congested; + new_congested = NULL; + rb_link_node(&congested->rb_node, parent, node); + rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); + atomic_inc(&bdi->usage_cnt); + goto found; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + + /* allocate storage for new one and retry */ + new_congested = kzalloc(sizeof(*new_congested), gfp); + if (!new_congested) + return NULL; + + atomic_set(&new_congested->refcnt, 0); + new_congested->bdi = bdi; + new_congested->blkcg_id = blkcg_id; + goto retry; + +found: + atomic_inc(&congested->refcnt); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(new_congested); + return congested; +} + +/** + * wb_congested_put - put a wb_congested + * @congested: wb_congested to put + * + * Put @congested and destroy it if the refcnt reaches zero. + */ +void wb_congested_put(struct bdi_writeback_congested *congested) +{ + struct backing_dev_info *bdi = congested->bdi; + unsigned long flags; + + if (congested->blkcg_id == 1) + return; + + local_irq_save(flags); + if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + local_irq_restore(flags); + return; + } + + rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(congested); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(work, struct bdi_writeback, + release_work); + struct backing_dev_info *bdi = wb->bdi; + + wb_shutdown(wb); + + css_put(wb->memcg_css); + css_put(wb->blkcg_css); + wb_congested_put(wb->congested); + + fprop_local_destroy_percpu(&wb->memcg_completions); + percpu_ref_exit(&wb->refcnt); + wb_exit(wb); + kfree_rcu(wb, rcu); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release(struct percpu_ref *refcnt) +{ + struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, + refcnt); + schedule_work(&wb->release_work); +} + +static void cgwb_kill(struct bdi_writeback *wb) +{ + lockdep_assert_held(&cgwb_lock); + + WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + list_del(&wb->memcg_node); + list_del(&wb->blkcg_node); + percpu_ref_kill(&wb->refcnt); +} + +static int cgwb_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, gfp_t gfp) +{ + struct mem_cgroup *memcg; + struct cgroup_subsys_state *blkcg_css; + struct blkcg *blkcg; + struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; + struct bdi_writeback *wb; + unsigned long flags; + int ret = 0; + + memcg = mem_cgroup_from_css(memcg_css); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg = css_to_blkcg(blkcg_css); + memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + blkcg_cgwb_list = &blkcg->cgwb_list; + + /* look up again under lock and discard on blkcg mismatch */ + spin_lock_irqsave(&cgwb_lock, flags); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb && wb->blkcg_css != blkcg_css) { + cgwb_kill(wb); + wb = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (wb) + goto out_put; + + /* need to create a new one */ + wb = kmalloc(sizeof(*wb), gfp); + if (!wb) + return -ENOMEM; + + ret = wb_init(wb, bdi, gfp); + if (ret) + goto err_free; + + ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); + if (ret) + goto err_wb_exit; + + ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); + if (ret) + goto err_ref_exit; + + wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); + if (!wb->congested) { + ret = -ENOMEM; + goto err_fprop_exit; + } + + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); /* - * Drain work list and shutdown the delayed_work. At this point, - * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi - * is dying and its work_list needs to be drained no matter what. + * The root wb determines the registered state of the whole bdi and + * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate + * whether they're still online. Don't link @wb if any is dead. + * See wb_memcg_offline() and wb_blkcg_offline(). */ - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - flush_delayed_work(&bdi->wb.dwork); + ret = -ENODEV; + spin_lock_irqsave(&cgwb_lock, flags); + if (test_bit(WB_registered, &bdi->wb.state) && + blkcg_cgwb_list->next && memcg_cgwb_list->next) { + /* we might have raced another instance of this function */ + ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + if (!ret) { + atomic_inc(&bdi->usage_cnt); + list_add(&wb->memcg_node, memcg_cgwb_list); + list_add(&wb->blkcg_node, blkcg_cgwb_list); + css_get(memcg_css); + css_get(blkcg_css); + } + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto err_put_congested; + } + goto out_put; + +err_put_congested: + wb_congested_put(wb->congested); +err_fprop_exit: + fprop_local_destroy_percpu(&wb->memcg_completions); +err_ref_exit: + percpu_ref_exit(&wb->refcnt); +err_wb_exit: + wb_exit(wb); +err_free: + kfree(wb); +out_put: + css_put(blkcg_css); + return ret; } -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. The returned wb has its refcount incremented. + * + * This function uses css_get() on @memcg_css and thus expects its refcnt + * to be positive on invocation. IOW, rcu_read_lock() protection on + * @memcg_css isn't enough. try_get it before calling this function. + * + * A wb is keyed by its associated memcg. As blkcg implicitly enables + * memcg on the default hierarchy, memcg association is guaranteed to be + * more specific (equal or descendant to the associated blkcg) and thus can + * identify both the memcg and blkcg associations. + * + * Because the blkcg associated with a memcg may change as blkcg is enabled + * and disabled closer to root in the hierarchy, each wb keeps track of + * both the memcg and blkcg associated with it and verifies the blkcg on + * each lookup. On mismatch, the existing wb is discarded and a new one is + * created. + */ +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp) { - memset(wb, 0, sizeof(*wb)); + struct bdi_writeback *wb; + + might_sleep_if(gfp & __GFP_WAIT); + + if (!memcg_css->parent) + return &bdi->wb; + + do { + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, + &blkio_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || + !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + + return wb; +} - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); - INIT_LIST_HEAD(&wb->b_dirty_time); - spin_lock_init(&wb->list_lock); - INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); +static void cgwb_bdi_init(struct backing_dev_info *bdi) +{ + bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.blkcg_css = blkcg_root_css; + bdi->wb_congested.blkcg_id = 1; + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + bdi->cgwb_congested_tree = RB_ROOT; + atomic_set(&bdi->usage_cnt, 1); } -/* - * Initial write bandwidth: 100 MB/s +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +{ + struct radix_tree_iter iter; + void **slot; + + WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + + spin_lock_irq(&cgwb_lock); + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) + cgwb_kill(*slot); + spin_unlock_irq(&cgwb_lock); + + /* + * All cgwb's and their congested states must be shutdown and + * released before returning. Drain the usage counter to wait for + * all cgwb's and cgwb_congested's ever created on @bdi. + */ + atomic_dec(&bdi->usage_cnt); + wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); +} + +/** + * wb_memcg_offline - kill all wb's associated with a memcg being offlined + * @memcg: memcg being offlined + * + * Also prevents creation of any new wb's associated with @memcg. */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +void wb_memcg_offline(struct mem_cgroup *memcg) +{ + LIST_HEAD(to_destroy); + struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) + cgwb_kill(wb); + memcg_cgwb_list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +/** + * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined + * @blkcg: blkcg being offlined + * + * Also prevents creation of any new wb's associated with @blkcg. + */ +void wb_blkcg_offline(struct blkcg *blkcg) +{ + LIST_HEAD(to_destroy); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + cgwb_kill(wb); + blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static void cgwb_bdi_init(struct backing_dev_info *bdi) { } +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +#endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { - int i, err; + int err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; - spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->work_list); + init_waitqueue_head(&bdi->wb_waitq); - bdi_wb_init(&bdi->wb, bdi); + err = wb_init(&bdi->wb, bdi, GFP_KERNEL); + if (err) + return err; - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); - if (err) - goto err; - } + bdi->wb_congested.state = 0; + bdi->wb.congested = &bdi->wb_congested; - bdi->dirty_exceeded = 0; + cgwb_bdi_init(bdi); + return 0; +} +EXPORT_SYMBOL(bdi_init); - bdi->bw_time_stamp = jiffies; - bdi->written_stamp = 0; +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) +{ + va_list args; + struct device *dev; - bdi->balanced_dirty_ratelimit = INIT_BW; - bdi->dirty_ratelimit = INIT_BW; - bdi->write_bandwidth = INIT_BW; - bdi->avg_write_bandwidth = INIT_BW; + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; - err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); - if (err) { -err: - while (i--) - percpu_counter_destroy(&bdi->bdi_stat[i]); - } + bdi->dev = dev; - return err; + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } -EXPORT_SYMBOL(bdi_init); +EXPORT_SYMBOL(bdi_register); -void bdi_destroy(struct backing_dev_info *bdi) +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) { - int i; + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); + +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); - bdi_wb_shutdown(bdi); - bdi_set_min_ratio(bdi, 0); + synchronize_rcu_expedited(); +} - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); +void bdi_destroy(struct backing_dev_info *bdi) +{ + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); @@ -437,9 +821,7 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi->dev = NULL; } - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) - percpu_counter_destroy(&bdi->bdi_stat[i]); - fprop_local_destroy_percpu(&bdi->completions); + wb_exit(&bdi->wb); } EXPORT_SYMBOL(bdi_destroy); @@ -472,31 +854,31 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; -static atomic_t nr_bdi_congested[2]; +static atomic_t nr_wb_congested[2]; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum bdi_state bit; wait_queue_head_t *wqh = &congestion_wqh[sync]; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (test_and_clear_bit(bit, &bdi->state)) - atomic_dec(&nr_bdi_congested[sync]); + bit = sync ? WB_sync_congested : WB_async_congested; + if (test_and_clear_bit(bit, &congested->state)) + atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } -EXPORT_SYMBOL(clear_bdi_congested); +EXPORT_SYMBOL(clear_wb_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int sync) +void set_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum bdi_state bit; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (!test_and_set_bit(bit, &bdi->state)) - atomic_inc(&nr_bdi_congested[sync]); + bit = sync ? WB_sync_congested : WB_async_congested; + if (!test_and_set_bit(bit, &congested->state)) + atomic_inc(&nr_wb_congested[sync]); } -EXPORT_SYMBOL(set_bdi_congested); +EXPORT_SYMBOL(set_wb_congested); /** * congestion_wait - wait for a backing_dev to become uncongested @@ -555,7 +937,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * encountered in the current zone, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_bdi_congested[sync]) == 0 || + if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { cond_resched(); diff --git a/mm/bootmem.c b/mm/bootmem.c index 477be696511d..a23dd1934654 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) return 0; @@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long cur = start; + cur = start; start = ALIGN(start + 1, BITS_PER_LONG); while (vec && cur != start) { if (vec & 1) { page = pfn_to_page(cur); - __free_pages_bootmem(page, 0); + __free_pages_bootmem(page, cur, 0); count++; } vec >>= 1; @@ -229,12 +229,13 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } } + cur = bdata->node_min_pfn; page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) - __free_pages_bootmem(page++, 0); + __free_pages_bootmem(page++, cur++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/debug.c b/mm/debug.c index 3eb3ac2fcee7..76089ddf99ea 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/memcontrol.h> static const struct trace_print_flags pageflag_names[] = { diff --git a/mm/fadvise.c b/mm/fadvise.c index 4a3907cf79f8..b8a5bc66b0c0 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(mapping->host)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index 8d17ceea8dbe..11f10efd637c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -100,6 +100,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -174,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock and + * mem_cgroup_begin_page_stat(). */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg) { struct address_space *mapping = page->mapping; @@ -212,7 +215,8 @@ void __delete_from_page_cache(struct page *page, void *shadow) * anyway will be cleared before returning page into buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping); + account_page_cleaned(page, mapping, memcg, + inode_to_wb(mapping->host)); } /** @@ -226,14 +230,20 @@ void __delete_from_page_cache(struct page *page, void *shadow) void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); freepage = mapping->a_ops->freepage; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -283,7 +293,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); return ret; } @@ -472,6 +484,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); + struct mem_cgroup *memcg; + unsigned long flags; pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; @@ -480,8 +494,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old, NULL); + memcg = mem_cgroup_begin_page_stat(old); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(old, NULL, memcg); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -493,7 +508,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 75c0eef52c5d..a8c3087089d8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -975,7 +975,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { - arch_release_hugepage(page); __free_pages(page, huge_page_order(h)); } } @@ -1160,10 +1159,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { - if (arch_prepare_hugepage(page)) { - __free_pages(page, huge_page_order(h)); - return NULL; - } prep_new_huge_page(h, page, nid); } @@ -1315,11 +1310,6 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); - if (page && arch_prepare_hugepage(page)) { - __free_pages(page, huge_page_order(h)); - page = NULL; - } - spin_lock(&hugetlb_lock); if (page) { INIT_LIST_HEAD(&page->lru); diff --git a/mm/internal.h b/mm/internal.h index a25e359a4039..36b23f1e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); @@ -361,10 +362,7 @@ do { \ } while (0) extern void mminit_verify_pageflags_layout(void); -extern void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn); extern void mminit_verify_zonelist(void); - #else static inline void mminit_dprintk(enum mminit_level level, @@ -376,11 +374,6 @@ static inline void mminit_verify_pageflags_layout(void) { } -static inline void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn) -{ -} - static inline void mminit_verify_zonelist(void) { } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4986b0acab21..c242adf6bc85 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -7,7 +7,6 @@ #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) #define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ diff --git a/mm/madvise.c b/mm/madvise.c index d551475517bf..64bb8a22110c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> diff --git a/mm/memblock.c b/mm/memblock.c index 1b444c730846..87108e77e476 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -820,6 +820,38 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) /** + * __next_reserved_mem_region - next function for for_each_reserved_region() + * @idx: pointer to u64 loop variable + * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL + * + * Iterate over all reserved memory regions. + */ +void __init_memblock __next_reserved_mem_region(u64 *idx, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + struct memblock_type *rsv = &memblock.reserved; + + if (*idx >= 0 && *idx < rsv->cnt) { + struct memblock_region *r = &rsv->regions[*idx]; + phys_addr_t base = r->base; + phys_addr_t size = r->size; + + if (out_start) + *out_start = base; + if (out_end) + *out_end = base + size - 1; + + *idx += 1; + return; + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/** * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes @@ -1387,7 +1419,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e65f7b0131d3..acb93c554f6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -77,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; +struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -90,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = { "rss", "rss_huge", "mapped_file", + "dirty", "writeback", "swap", }; @@ -322,11 +324,6 @@ struct mem_cgroup { * percpu counter. */ struct mem_cgroup_stat_cpu __percpu *stat; - /* - * used when a cpu is offlined or other synchronizations - * See mem_cgroup_read_stat(). - */ - struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) @@ -346,6 +343,11 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; + struct wb_domain cgwb_domain; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -596,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) return &memcg->css; } +/** + * mem_cgroup_css_from_page - css of the memcg associated with a page + * @page: page of interest + * + * If memcg is bound to the default hierarchy, css of the memcg associated + * with @page is returned. The returned css remains associated with @page + * until it is released. + * + * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup + * is returned. + * + * XXX: The above description of behavior on the default hierarchy isn't + * strictly true yet as replace_page_cache_page() can modify the + * association before @page is released even on the default hierarchy; + * however, the current and planned usages don't mix the the two functions + * and replace_page_cache_page() will soon be updated to make the invariant + * actually true. + */ +struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + + memcg = page->mem_cgroup; + + if (!memcg || !cgroup_on_dfl(memcg->css.cgroup)) + memcg = root_mem_cgroup; + + rcu_read_unlock(); + return &memcg->css; +} + static struct mem_cgroup_per_zone * mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { @@ -795,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->count[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.count[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); return val; } @@ -813,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.events[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); return val; } @@ -2020,6 +2041,7 @@ again: return memcg; } +EXPORT_SYMBOL(mem_cgroup_begin_page_stat); /** * mem_cgroup_end_page_stat - finish a page state statistics transaction @@ -2038,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } +EXPORT_SYMBOL(mem_cgroup_end_page_stat); /** * mem_cgroup_update_page_stat - update page state statistics @@ -2178,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -/* - * This function drains percpu counter value from DEAD cpu and - * move it to local cpu. Note that this function can be preempted. - */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) -{ - int i; - - spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long x = per_cpu(memcg->stat->count[i], cpu); - - per_cpu(memcg->stat->count[i], cpu) = 0; - memcg->nocpu_base.count[i] += x; - } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(memcg->stat->events[i], cpu); - - per_cpu(memcg->stat->events[i], cpu) = 0; - memcg->nocpu_base.events[i] += x; - } - spin_unlock(&memcg->pcp_counter_lock); -} - static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - struct mem_cgroup *iter; if (action == CPU_ONLINE) return NOTIFY_OK; @@ -2216,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup(iter) - mem_cgroup_drain_pcp_counter(iter, cpu); - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; @@ -4004,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) +{ + return &memcg->cgwb_list; +} + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +/** + * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg + * @wb: bdi_writeback in question + * @pavail: out parameter for number of available pages + * @pdirty: out parameter for number of dirty pages + * @pwriteback: out parameter for number of pages under writeback + * + * Determine the numbers of available, dirty, and writeback pages in @wb's + * memcg. Dirty and writeback are self-explanatory. Available is a bit + * more involved. + * + * A memcg's headroom is "min(max, high) - used". The available memory is + * calculated as the lowest headroom of itself and the ancestors plus the + * number of pages already being used for file pages. Note that this + * doesn't consider the actual amount of available memory in the system. + * The caller should further cap *@pavail accordingly. + */ +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail, + unsigned long *pdirty, unsigned long *pwriteback) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + unsigned long head_room = PAGE_COUNTER_MAX; + unsigned long file_pages; + + *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + + /* this should eventually include NR_UNSTABLE_NFS */ + *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + + file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | + (1 << LRU_ACTIVE_FILE)); + while ((parent = parent_mem_cgroup(memcg))) { + unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long used = page_counter_read(&memcg->memory); + + head_room = min(head_room, ceiling - min(ceiling, used)); + memcg = parent; + } + + *pavail = file_pages + head_room; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * DO NOT USE IN NEW FILES. * @@ -4388,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto out_free; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + spin_lock_init(&memcg->pcp_counter_lock); return memcg; +out_free_stat: + free_percpu(memcg->stat); out_free: kfree(memcg); return NULL; @@ -4417,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); + memcg_wb_domain_exit(memcg); kfree(memcg); } @@ -4449,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; + mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4467,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); +#endif return &memcg->css; free_out: @@ -4555,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) @@ -4588,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; + memcg_wb_domain_size_changed(memcg); } #ifdef CONFIG_MMU @@ -4757,6 +4857,7 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; + bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -4782,15 +4883,33 @@ static int mem_cgroup_move_account(struct page *page, if (page->mem_cgroup != from) goto out_unlock; + anon = PageAnon(page); + spin_lock_irqsave(&from->move_lock, flags); - if (!PageAnon(page) && page_mapped(page)) { + if (!anon && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); } + /* + * move_lock grabbed above and caller set from->moving_account, so + * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * So mapping should be stable for dirty pages. + */ + if (!anon && PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + } + } + if (PageWriteback(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], nr_pages); @@ -5306,6 +5425,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5338,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; + memcg_wb_domain_size_changed(memcg); return nbytes; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 5f420f7fafa1..fdadf918de76 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/memory.h> #include <linux/notifier.h> +#include <linux/sched.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void) BUG_ON(or_mask != add_mask); } -void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, - unsigned long nid, unsigned long pfn) -{ - BUG_ON(page_to_nid(page) != nid); - BUG_ON(page_zonenum(page) != zone); - BUG_ON(page_to_pfn(page) != pfn); -} - static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 5258386fa1be..e57cf24babd6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -86,7 +86,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -101,7 +101,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order); } @@ -130,6 +130,9 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/nommu.c b/mm/nommu.c index 05e7447d960b..58ea3643b9e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -2085,7 +2085,7 @@ static int __meminit init_user_reserve(void) sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; } -module_init(init_user_reserve) +subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. @@ -2106,4 +2106,4 @@ static int __meminit init_admin_reserve(void) sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; } -module_init(init_admin_reserve) +subsys_initcall(init_admin_reserve); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index eb59f7eea508..22cddd3e5de8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -122,31 +122,31 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -unsigned long global_dirty_limit; +struct wb_domain global_wb_domain; -/* - * Scale the writeback cache size proportional to the relative writeout speeds. - * - * We do this by keeping a floating proportion between BDIs, based on page - * writeback completions [end_page_writeback()]. Those devices that write out - * pages fastest will get the larger share, while the slower will get a smaller - * share. - * - * We use page writeout completions because we are interested in getting rid of - * dirty pages. Having them written out is the primary goal. - * - * We introduce a concept of time, a period over which we measure these events, - * because demand can/will vary over time. The length of this period itself is - * measured in page writeback completions. - * - */ -static struct fprop_global writeout_completions; +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; -static void writeout_period(unsigned long t); -/* Timer for aging of writeout_completions */ -static struct timer_list writeout_period_timer = - TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); -static unsigned long writeout_period_time = 0; + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; +}; + +#define DTC_INIT_COMMON(__wb) .wb = (__wb), \ + .wb_completions = &(__wb)->completions /* * Length of period for aging writeout fractions of bdis. This is an @@ -155,6 +155,97 @@ static unsigned long writeout_period_time = 0; */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) +#ifdef CONFIG_CGROUP_WRITEBACK + +#define GDTC_INIT(__wb) .dom = &global_wb_domain, \ + DTC_INIT_COMMON(__wb) +#define GDTC_INIT_NO_WB .dom = &global_wb_domain +#define MDTC_INIT(__wb, __gdtc) .dom = mem_cgroup_wb_domain(__wb), \ + .gdtc = __gdtc, \ + DTC_INIT_COMMON(__wb) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return mdtc->gdtc; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + unsigned long long min = wb->bdi->min_ratio; + unsigned long long max = wb->bdi->max_ratio; + + /* + * @wb may already be clean by the time control reaches here and + * the total may not include its bw. + */ + if (this_bw < tot_bw) { + if (min) { + min *= this_bw; + do_div(min, tot_bw); + } + if (max < 100) { + max *= this_bw; + do_div(max, tot_bw); + } + } + + *minp = min; + *maxp = max; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +#define GDTC_INIT(__wb) DTC_INIT_COMMON(__wb) +#define GDTC_INIT_NO_WB +#define MDTC_INIT(__wb, __gdtc) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return false; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return &global_wb_domain; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return NULL; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + *minp = wb->bdi->min_ratio; + *maxp = wb->bdi->max_ratio; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of @@ -250,42 +341,88 @@ static unsigned long global_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds +/** + * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain + * @dtc: dirty_throttle_control of interest * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * Calculate @dtc->thresh and ->bg_thresh considering + * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller + * must ensure that @dtc->avail is set before calling this function. The + * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and * real-time tasks. */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +static void domain_dirty_limits(struct dirty_throttle_control *dtc) { - const unsigned long available_memory = global_dirtyable_memory(); - unsigned long background; - unsigned long dirty; + const unsigned long available_memory = dtc->avail; + struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); + unsigned long bytes = vm_dirty_bytes; + unsigned long bg_bytes = dirty_background_bytes; + unsigned long ratio = vm_dirty_ratio; + unsigned long bg_ratio = dirty_background_ratio; + unsigned long thresh; + unsigned long bg_thresh; struct task_struct *tsk; - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + /* gdtc is !NULL iff @dtc is for memcg domain */ + if (gdtc) { + unsigned long global_avail = gdtc->avail; + + /* + * The byte settings can't be applied directly to memcg + * domains. Convert them to ratios by scaling against + * globally available memory. + */ + if (bytes) + ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / + global_avail, 100UL); + if (bg_bytes) + bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 / + global_avail, 100UL); + bytes = bg_bytes = 0; + } + + if (bytes) + thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - dirty = (vm_dirty_ratio * available_memory) / 100; + thresh = (ratio * available_memory) / 100; - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + if (bg_bytes) + bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - background = (dirty_background_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; + bg_thresh += bg_thresh / 4; + thresh += thresh / 4; } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); + dtc->thresh = thresh; + dtc->bg_thresh = bg_thresh; + + /* we should eventually report the domain in the TP */ + if (!gdtc) + trace_global_dirty_state(bg_thresh, thresh); +} + +/** + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * @pbackground: out parameter for bg_thresh + * @pdirty: out parameter for thresh + * + * Calculate bg_thresh and thresh for global_wb_domain. See + * domain_dirty_limits() for details. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + + gdtc.avail = global_dirtyable_memory(); + domain_dirty_limits(&gdtc); + + *pbackground = gdtc.bg_thresh; + *pdirty = gdtc.thresh; } /** @@ -392,47 +529,52 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -/* - * Increment the BDI's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). - */ -static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +static void wb_domain_writeout_inc(struct wb_domain *dom, + struct fprop_local_percpu *completions, + unsigned int max_prop_frac) { - __inc_bdi_stat(bdi, BDI_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&dom->completions, completions, + max_prop_frac); /* First event after period switching was turned off? */ - if (!unlikely(writeout_period_time)) { + if (!unlikely(dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. */ - writeout_period_time = wp_next_time(jiffies); - mod_timer(&writeout_period_timer, writeout_period_time); + dom->period_time = wp_next_time(jiffies); + mod_timer(&dom->period_timer, dom->period_time); } } -void bdi_writeout_inc(struct backing_dev_info *bdi) +/* + * Increment @wb's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __wb_writeout_inc(struct bdi_writeback *wb) { - unsigned long flags; + struct wb_domain *cgdom; - local_irq_save(flags); - __bdi_writeout_inc(bdi); - local_irq_restore(flags); + __inc_wb_stat(wb, WB_WRITTEN); + wb_domain_writeout_inc(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac); } -EXPORT_SYMBOL_GPL(bdi_writeout_inc); -/* - * Obtain an accurate fraction of the BDI's portion. - */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, - long *numerator, long *denominator) +void wb_writeout_inc(struct bdi_writeback *wb) { - fprop_fraction_percpu(&writeout_completions, &bdi->completions, - numerator, denominator); + unsigned long flags; + + local_irq_save(flags); + __wb_writeout_inc(wb); + local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use @@ -440,22 +582,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, */ static void writeout_period(unsigned long t) { - int miss_periods = (jiffies - writeout_period_time) / + struct wb_domain *dom = (void *)t; + int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; - if (fprop_new_period(&writeout_completions, miss_periods + 1)) { - writeout_period_time = wp_next_time(writeout_period_time + + if (fprop_new_period(&dom->completions, miss_periods + 1)) { + dom->period_time = wp_next_time(dom->period_time + miss_periods * VM_COMPLETIONS_PERIOD_LEN); - mod_timer(&writeout_period_timer, writeout_period_time); + mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ - writeout_period_time = 0; + dom->period_time = 0; } } +int wb_domain_init(struct wb_domain *dom, gfp_t gfp) +{ + memset(dom, 0, sizeof(*dom)); + + spin_lock_init(&dom->lock); + + init_timer_deferrable(&dom->period_timer); + dom->period_timer.function = writeout_period; + dom->period_timer.data = (unsigned long)dom; + + dom->dirty_limit_tstamp = jiffies; + + return fprop_global_init(&dom->completions, gfp); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + del_timer_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not @@ -510,17 +676,26 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh, return (thresh + bg_thresh) / 2; } -static unsigned long hard_dirty_limit(unsigned long thresh) +static unsigned long hard_dirty_limit(struct wb_domain *dom, + unsigned long thresh) { - return max(thresh, global_dirty_limit); + return max(thresh, dom->dirty_limit); +} + +/* memory available to a memcg domain is capped by system-wide clean memory */ +static void mdtc_cap_avail(struct dirty_throttle_control *mdtc) +{ + struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); + unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); + + mdtc->avail = min(mdtc->avail, clean); } /** - * bdi_dirty_limit - @bdi's share of dirty throttling threshold - * @bdi: the backing_dev_info to query - * @dirty: global dirty limit in pages + * __wb_calc_thresh - @wb's share of dirty throttling threshold + * @dtc: dirty_throttle_context of interest * - * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * Returns @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. * * Note that balance_dirty_pages() will only seriously take it as a hard limit @@ -528,34 +703,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh) * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks - * more (rather than completely block them) when the bdi dirty pages go high. + * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * - * The bdi's share of dirty limit will be adapting to its throughput and + * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. */ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { - u64 bdi_dirty; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + u64 wb_thresh; long numerator, denominator; + unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the dirty ratio. + * Calculate this BDI's share of the thresh ratio. */ - bdi_writeout_fraction(bdi, &numerator, &denominator); + fprop_fraction_percpu(&dom->completions, dtc->wb_completions, + &numerator, &denominator); + + wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh *= numerator; + do_div(wb_thresh, denominator); - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); + wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / 100; + if (wb_thresh > (thresh * wb_max_ratio) / 100) + wb_thresh = thresh * wb_max_ratio / 100; - return bdi_dirty; + return wb_thresh; +} + +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb), + .thresh = thresh }; + return __wb_calc_thresh(&gdtc); } /* @@ -594,7 +782,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * * (o) global/bdi setpoints * - * We want the dirty pages be balanced around the global/bdi setpoints. + * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. @@ -604,8 +792,8 @@ static long long pos_ratio_polynom(unsigned long setpoint, * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * - * if (bdi_dirty < bdi_setpoint) scale up pos_ratio - * if (bdi_dirty > bdi_setpoint) scale down pos_ratio + * if (wb_dirty < wb_setpoint) scale up pos_ratio + * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * @@ -630,7 +818,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * - * (o) bdi control line + * (o) wb control line * * ^ pos_ratio * | @@ -656,33 +844,32 @@ static long long pos_ratio_polynom(unsigned long setpoint, * | . . * | . . * 0 +----------------------.-------------------------------.-------------> - * bdi_setpoint^ x_intercept^ + * wb_setpoint^ x_intercept^ * - * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can + * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD - * card's bdi_dirty may rush to many times higher than bdi_setpoint. - * - the bdi dirty thresh drops quickly due to change of JBOD workload + * card's wb_dirty may rush to many times higher than wb_setpoint. + * - the wb dirty thresh drops quickly due to change of JBOD workload */ -static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty) -{ - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_position_ratio(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ - unsigned long bdi_setpoint; + unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; - if (unlikely(dirty >= limit)) - return 0; + dtc->pos_ratio = 0; + + if (unlikely(dtc->dirty >= limit)) + return; /* * global setpoint @@ -690,165 +877,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; - pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For - * such filesystems balance_dirty_pages always checks bdi counters - * against bdi limits. Even if global "nr_dirty" is under "freerun". + * such filesystems balance_dirty_pages always checks wb counters + * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * Here, in bdi_position_ratio(), we calculate pos_ratio based on - * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * Here, in wb_position_ratio(), we calculate pos_ratio based on + * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). - * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. - * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is - * about ~6K pages (as the average of background and throttle bdi + * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if - * bdi_dirty is under bdi_setpoint and vice versa. + * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations - * because we want to throttle process writing to a strictlimit BDI + * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - long long bdi_pos_ratio; - unsigned long bdi_bg_thresh; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long wb_pos_ratio; - if (bdi_dirty < 8) - return min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); + if (dtc->wb_dirty < 8) { + dtc->pos_ratio = min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + return; + } - if (bdi_dirty >= bdi_thresh) - return 0; + if (dtc->wb_dirty >= wb_thresh) + return; - bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); - bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, - bdi_bg_thresh); + wb_setpoint = dirty_freerun_ceiling(wb_thresh, + dtc->wb_bg_thresh); - if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) - return 0; + if (wb_setpoint == 0 || wb_setpoint == wb_thresh) + return; - bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, - bdi_thresh); + wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, + wb_thresh); /* - * Typically, for strictlimit case, bdi_setpoint << setpoint - * and pos_ratio >> bdi_pos_ratio. In the other words global + * Typically, for strictlimit case, wb_setpoint << setpoint + * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to - * make decision based on bdi counters. But there is an + * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other - * BDIs) while given strictlimit BDI is below limit. + * wb's) while given strictlimit wb is below limit. * - * "pos_ratio * bdi_pos_ratio" would work for the case above, + * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all - * activity in the system coming from a single strictlimit BDI + * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. * * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 - * (when globally we are at freerun and bdi is well below bdi + * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ - return min(pos_ratio, bdi_pos_ratio); + dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); + return; } /* * We have computed basic pos_ratio above based on global situation. If - * the bdi is over/under its share of dirty pages, we want to scale + * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* - * bdi setpoint + * wb setpoint * - * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) + * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * - * x_intercept - bdi_dirty + * x_intercept - wb_dirty * := -------------------------- - * x_intercept - bdi_setpoint + * x_intercept - wb_setpoint * - * The main bdi control line is a linear function that subjects to + * The main wb control line is a linear function that subjects to * - * (1) f(bdi_setpoint) = 1.0 - * (2) k = - 1 / (8 * write_bw) (in single bdi case) - * or equally: x_intercept = bdi_setpoint + 8 * write_bw + * (1) f(wb_setpoint) = 1.0 + * (2) k = - 1 / (8 * write_bw) (in single wb case) + * or equally: x_intercept = wb_setpoint + 8 * write_bw * - * For single bdi case, the dirty pages are observed to fluctuate + * For single wb case, the dirty pages are observed to fluctuate * regularly within range - * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] + * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * - * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its + * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that - * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. + * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ - if (unlikely(bdi_thresh > thresh)) - bdi_thresh = thresh; + if (unlikely(wb_thresh > dtc->thresh)) + wb_thresh = dtc->thresh; /* - * It's very possible that bdi_thresh is close to 0 not because the + * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ - bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); + wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* - * scale global setpoint to bdi's: - * bdi_setpoint = setpoint * bdi_thresh / thresh + * scale global setpoint to wb's: + * wb_setpoint = setpoint * wb_thresh / thresh */ - x = div_u64((u64)bdi_thresh << 16, thresh | 1); - bdi_setpoint = setpoint * (u64)x >> 16; + x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); + wb_setpoint = setpoint * (u64)x >> 16; /* - * Use span=(8*write_bw) in single bdi case as indicated by - * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. + * Use span=(8*write_bw) in single wb case as indicated by + * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. * - * bdi_thresh thresh - bdi_thresh - * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh - * thresh thresh + * wb_thresh thresh - wb_thresh + * span = --------- * (8 * write_bw) + ------------------ * wb_thresh + * thresh thresh */ - span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; - x_intercept = bdi_setpoint + span; + span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = wb_setpoint + span; - if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), - (x_intercept - bdi_setpoint) | 1); + if (dtc->wb_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), + (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* - * bdi reserve area, safeguard against dirty pool underrun and disk idle + * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. */ - x_intercept = bdi_thresh / 2; - if (bdi_dirty < x_intercept) { - if (bdi_dirty > x_intercept / 8) - pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); + x_intercept = wb_thresh / 2; + if (dtc->wb_dirty < x_intercept) { + if (dtc->wb_dirty > x_intercept / 8) + pos_ratio = div_u64(pos_ratio * x_intercept, + dtc->wb_dirty); else pos_ratio *= 8; } - return pos_ratio; + dtc->pos_ratio = pos_ratio; } -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, - unsigned long elapsed, - unsigned long written) +static void wb_update_write_bandwidth(struct bdi_writeback *wb, + unsigned long elapsed, + unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); - unsigned long avg = bdi->avg_write_bandwidth; - unsigned long old = bdi->write_bandwidth; + unsigned long avg = wb->avg_write_bandwidth; + unsigned long old = wb->write_bandwidth; u64 bw; /* @@ -861,14 +1050,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. */ - bw = written - min(written, bdi->written_stamp); + bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); avg = bw; goto out; } - bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* @@ -881,21 +1070,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, avg += (old - avg) >> 3; out: - bdi->write_bandwidth = bw; - bdi->avg_write_bandwidth = avg; + /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ + avg = max(avg, 1LU); + if (wb_has_dirty_io(wb)) { + long delta = avg - wb->avg_write_bandwidth; + WARN_ON_ONCE(atomic_long_add_return(delta, + &wb->bdi->tot_write_bandwidth) <= 0); + } + wb->write_bandwidth = bw; + wb->avg_write_bandwidth = avg; } -/* - * The global dirtyable memory and dirty threshold could be suddenly knocked - * down by a large amount (eg. on the startup of KVM in a swapless system). - * This may throw the system into deep dirty exceeded state and throttle - * heavy/light dirtiers alike. To retain good responsiveness, maintain - * global_dirty_limit for tracking slowly down to the knocked down dirty - * threshold. - */ -static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +static void update_dirty_limit(struct dirty_throttle_control *dtc) { - unsigned long limit = global_dirty_limit; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + unsigned long limit = dom->dirty_limit; /* * Follow up in one step. @@ -908,63 +1098,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce - * global_dirty_limit which is guaranteed to lie above the dirty pages. + * dom->dirty_limit which is guaranteed to lie above the dirty pages. */ - thresh = max(thresh, dirty); + thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: - global_dirty_limit = limit; + dom->dirty_limit = limit; } -static void global_update_bandwidth(unsigned long thresh, - unsigned long dirty, +static void domain_update_bandwidth(struct dirty_throttle_control *dtc, unsigned long now) { - static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time = INITIAL_JIFFIES; + struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ - if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; - spin_lock(&dirty_lock); - if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { - update_dirty_limit(thresh, dirty); - update_time = now; + spin_lock(&dom->lock); + if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { + update_dirty_limit(dtc); + dom->dirty_limit_tstamp = now; } - spin_unlock(&dirty_lock); + spin_unlock(&dom->lock); } /* - * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. + * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * - * Normal bdi tasks will be curbed at or below it in long term. + * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ -static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long dirtied, - unsigned long elapsed) -{ - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, + unsigned long dirtied, + unsigned long elapsed) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long dirty = dtc->dirty; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long dirty_ratelimit = bdi->dirty_ratelimit; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; - unsigned long pos_ratio; unsigned long step; unsigned long x; @@ -972,20 +1156,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ - dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; - pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty); /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * - pos_ratio >> RATELIMIT_CALC_SHIFT; + dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. The theory is, - * if there are N dd tasks, each throttled at task_ratelimit, the bdi's + * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * @@ -1024,7 +1206,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, /* * We could safely do this and return immediately: * - * bdi->dirty_ratelimit = balanced_dirty_ratelimit; + * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and @@ -1058,32 +1240,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, step = 0; /* - * For strictlimit case, calculations above were based on bdi counters - * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * For strictlimit case, calculations above were based on wb counters + * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). - * Hence, to calculate "step" properly, we have to use bdi_dirty as - * "dirty" and bdi_setpoint as "setpoint". + * Hence, to calculate "step" properly, we have to use wb_dirty as + * "dirty" and wb_setpoint as "setpoint". * - * We rampup dirty_ratelimit forcibly if bdi_dirty is low because - * it's possible that bdi_thresh is close to zero due to inactivity - * of backing device (see the implementation of bdi_dirty_limit()). + * We rampup dirty_ratelimit forcibly if wb_dirty is low because + * it's possible that wb_thresh is close to zero due to inactivity + * of backing device. */ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - dirty = bdi_dirty; - if (bdi_dirty < 8) - setpoint = bdi_dirty + 1; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = dtc->wb_dirty; + if (dtc->wb_dirty < 8) + setpoint = dtc->wb_dirty + 1; else - setpoint = (bdi_thresh + - bdi_dirty_limit(bdi, bg_thresh)) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { - x = min3(bdi->balanced_dirty_ratelimit, + x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max3(bdi->balanced_dirty_ratelimit, + x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; @@ -1105,69 +1286,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, else dirty_ratelimit -= step; - bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); - bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); } -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, + struct dirty_throttle_control *mdtc, + unsigned long start_time, + bool update_ratelimit) { + struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long elapsed = now - wb->bw_time_stamp; unsigned long dirtied; unsigned long written; + lockdep_assert_held(&wb->list_lock); + /* * rate-limit, only update once every 200ms. */ if (elapsed < BANDWIDTH_INTERVAL) return; - dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); - written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); + written = percpu_counter_read(&wb->stat[WB_WRITTEN]); /* * Skip quiet periods when disk bandwidth is under-utilized. * (at least 1s idle time between two flusher runs) */ - if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) goto snapshot; - if (thresh) { - global_update_bandwidth(thresh, dirty, now); - bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, - dirtied, elapsed); + if (update_ratelimit) { + domain_update_bandwidth(gdtc, now); + wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); + + /* + * @mdtc is always NULL if !CGROUP_WRITEBACK but the + * compiler has no way to figure that out. Help it. + */ + if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { + domain_update_bandwidth(mdtc, now); + wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); + } } - bdi_update_write_bandwidth(bdi, elapsed, written); + wb_update_write_bandwidth(wb, elapsed, written); snapshot: - bdi->dirtied_stamp = dirtied; - bdi->written_stamp = written; - bdi->bw_time_stamp = now; + wb->dirtied_stamp = dirtied; + wb->written_stamp = written; + wb->bw_time_stamp = now; } -static void bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) - return; - spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, start_time); - spin_unlock(&bdi->wb.list_lock); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + __wb_update_bandwidth(&gdtc, NULL, start_time, false); } /* @@ -1187,10 +1366,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long wb_max_pause(struct bdi_writeback *wb, + unsigned long wb_dirty) { - unsigned long bw = bdi->avg_write_bandwidth; + unsigned long bw = wb->avg_write_bandwidth; unsigned long t; /* @@ -1200,20 +1379,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. */ - t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } -static long bdi_min_pause(struct backing_dev_info *bdi, - long max_pause, - unsigned long task_ratelimit, - unsigned long dirty_ratelimit, - int *nr_dirtied_pause) +static long wb_min_pause(struct bdi_writeback *wb, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) { - long hi = ilog2(bdi->avg_write_bandwidth); - long lo = ilog2(bdi->dirty_ratelimit); + long hi = ilog2(wb->avg_write_bandwidth); + long lo = ilog2(wb->dirty_ratelimit); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1281,34 +1460,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } -static inline void bdi_dirty_limits(struct backing_dev_info *bdi, - unsigned long dirty_thresh, - unsigned long background_thresh, - unsigned long *bdi_dirty, - unsigned long *bdi_thresh, - unsigned long *bdi_bg_thresh) +static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { - unsigned long bdi_reclaimable; + struct bdi_writeback *wb = dtc->wb; + unsigned long wb_reclaimable; /* - * bdi_thresh is not treated as some limiting factor as + * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. + * go into state (wb_dirty >> wb_thresh) either because + * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + * dirtiers for 100 seconds until wb_dirty drops under + * wb_thresh. Instead the auxiliary wb control line in + * wb_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - if (bdi_bg_thresh) - *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * - background_thresh, - dirty_thresh) : 0; + dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_bg_thresh = dtc->thresh ? + div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1320,14 +1492,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); + if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); + wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } @@ -1339,12 +1509,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, + struct bdi_writeback *wb, unsigned long pages_dirtied) { + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? + &mdtc_stor : NULL; + struct dirty_throttle_control *sdtc; unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long background_thresh; - unsigned long dirty_thresh; long period; long pause; long max_pause; @@ -1353,18 +1527,14 @@ static void balance_dirty_pages(struct address_space *mapping, bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; - unsigned long pos_ratio; - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; - unsigned long uninitialized_var(bdi_thresh); - unsigned long thresh; - unsigned long uninitialized_var(bdi_dirty); - unsigned long dirty; - unsigned long bg_thresh; + unsigned long dirty, thresh, bg_thresh; + unsigned long m_dirty, m_thresh, m_bg_thresh; /* * Unstable writes are a feature of certain networked @@ -1374,65 +1544,127 @@ static void balance_dirty_pages(struct address_space *mapping, */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); - global_dirty_limits(&background_thresh, &dirty_thresh); + domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, &bg_thresh); + wb_dirty_limits(gdtc); - dirty = bdi_dirty; - thresh = bdi_thresh; + dirty = gdtc->wb_dirty; + thresh = gdtc->wb_thresh; + bg_thresh = gdtc->wb_bg_thresh; } else { - dirty = nr_dirty; - thresh = dirty_thresh; - bg_thresh = background_thresh; + dirty = gdtc->dirty; + thresh = gdtc->thresh; + bg_thresh = gdtc->bg_thresh; + } + + if (mdtc) { + unsigned long writeback; + + /* + * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. + */ + mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, + &writeback); + mdtc_cap_avail(mdtc); + mdtc->dirty += writeback; + + domain_dirty_limits(mdtc); + + if (unlikely(strictlimit)) { + wb_dirty_limits(mdtc); + m_dirty = mdtc->wb_dirty; + m_thresh = mdtc->wb_thresh; + m_bg_thresh = mdtc->wb_bg_thresh; + } else { + m_dirty = mdtc->dirty; + m_thresh = mdtc->thresh; + m_bg_thresh = mdtc->bg_thresh; + } } /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up in case of !strictlimit. + * when the wb limits are ramping up in case of !strictlimit. * - * In strictlimit case make decision based on the bdi counters - * and limits. Small writeouts when the bdi limits are ramping + * In strictlimit case make decision based on the wb counters + * and limits. Small writeouts when the wb limits are ramping * up are the price we consciously pay for strictlimit-ing. + * + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. */ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && + (!mdtc || + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + unsigned long intv = dirty_poll_interval(dirty, thresh); + unsigned long m_intv = ULONG_MAX; + current->dirty_paused_when = now; current->nr_dirtied = 0; - current->nr_dirtied_pause = - dirty_poll_interval(dirty, thresh); + if (mdtc) + m_intv = dirty_poll_interval(m_dirty, m_thresh); + current->nr_dirtied_pause = min(intv, m_intv); break; } - if (unlikely(!writeback_in_progress(bdi))) - bdi_start_background_writeback(bdi); + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + /* + * Calculate global domain's pos_ratio and select the + * global dtc by default. + */ if (!strictlimit) - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, NULL); - - dirty_exceeded = (bdi_dirty > bdi_thresh) && - ((nr_dirty > dirty_thresh) || strictlimit); - if (dirty_exceeded && !bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, - nr_dirty, bdi_thresh, bdi_dirty, - start_time); - - dirty_ratelimit = bdi->dirty_ratelimit; - pos_ratio = bdi_position_ratio(bdi, dirty_thresh, - background_thresh, nr_dirty, - bdi_thresh, bdi_dirty); - task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> + wb_dirty_limits(gdtc); + + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && + ((gdtc->dirty > gdtc->thresh) || strictlimit); + + wb_position_ratio(gdtc); + sdtc = gdtc; + + if (mdtc) { + /* + * If memcg domain is in effect, calculate its + * pos_ratio. @wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. + */ + if (!strictlimit) + wb_dirty_limits(mdtc); + + dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && + ((mdtc->dirty > mdtc->thresh) || strictlimit); + + wb_position_ratio(mdtc); + if (mdtc->pos_ratio < gdtc->pos_ratio) + sdtc = mdtc; + } + + if (dirty_exceeded && !wb->dirty_exceeded) + wb->dirty_exceeded = 1; + + if (time_is_before_jiffies(wb->bw_time_stamp + + BANDWIDTH_INTERVAL)) { + spin_lock(&wb->list_lock); + __wb_update_bandwidth(gdtc, mdtc, start_time, true); + spin_unlock(&wb->list_lock); + } + + /* throttle according to the chosen dtc */ + dirty_ratelimit = wb->dirty_ratelimit; + task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; - max_pause = bdi_max_pause(bdi, bdi_dirty); - min_pause = bdi_min_pause(bdi, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); + max_pause = wb_max_pause(wb, sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; @@ -1452,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping, */ if (pause < min_pause) { trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1481,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping, pause: trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1500,33 +1732,33 @@ pause: current->nr_dirtied_pause = nr_dirtied_pause; /* - * This is typically equal to (nr_dirty < dirty_thresh) and can - * also keep "1000+ dd on a slow USB stick" under control. + * This is typically equal to (dirty < thresh) and can also + * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponding NFS server and the NFS dirty - * pages exceeds dirty_thresh, give the other good bdi's a pipe + * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the comsumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 - * more page. However bdi_dirty has accounting errors. So use - * the larger and more IO friendly bdi_stat_error. + * more page. However wb_dirty has accounting errors. So use + * the larger and more IO friendly wb_stat_error. */ - if (bdi_dirty <= bdi_stat_error(bdi)) + if (sdtc->wb_dirty <= wb_stat_error(wb)) break; if (fatal_signal_pending(current)) break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (!dirty_exceeded && wb->dirty_exceeded) + wb->dirty_exceeded = 0; - if (writeback_in_progress(bdi)) + if (writeback_in_progress(wb)) return; /* @@ -1540,8 +1772,8 @@ pause: if (laptop_mode) return; - if (nr_reclaimable > background_thresh) - bdi_start_background_writeback(bdi); + if (nr_reclaimable > gdtc->bg_thresh) + wb_start_background_writeback(wb); } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1577,15 +1809,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; int ratelimit; int *p; if (!bdi_cap_account_dirty(bdi)) return; + if (inode_cgwb_enabled(inode)) + wb = wb_get_create_current(bdi, GFP_KERNEL); + if (!wb) + wb = &bdi->wb; + ratelimit = current->nr_dirtied_pause; - if (bdi->dirty_exceeded) + if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); @@ -1617,10 +1856,59 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, current->nr_dirtied); + balance_dirty_pages(mapping, wb, current->nr_dirtied); + + wb_put(wb); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +/** + * wb_over_bg_thresh - does @wb need to be written back? + * @wb: bdi_writeback of interest + * + * Determines whether background writeback should keep writing @wb or it's + * clean enough. Returns %true if writeback should continue. + */ +bool wb_over_bg_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? + &mdtc_stor : NULL; + + /* + * Similar to balance_dirty_pages() but ignores pages being written + * as we're trying to decide whether to put more under writeback. + */ + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + domain_dirty_limits(gdtc); + + if (gdtc->dirty > gdtc->bg_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + return true; + + if (mdtc) { + unsigned long writeback; + + mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback); + mdtc_cap_avail(mdtc); + domain_dirty_limits(mdtc); /* ditto, ignore writeback */ + + if (mdtc->dirty > mdtc->bg_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + return true; + } + + return false; +} + void throttle_vm_writeout(gfp_t gfp_mask) { unsigned long background_thresh; @@ -1628,7 +1916,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) for ( ; ; ) { global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(dirty_thresh); + dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -1667,14 +1955,20 @@ void laptop_mode_timer_fn(unsigned long data) struct request_queue *q = (struct request_queue *)data; int nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); + struct bdi_writeback *wb; + struct wb_iter iter; /* * We want to write everything out, not just down to the dirty * threshold */ - if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages, - WB_REASON_LAPTOP_TIMER); + if (!bdi_has_dirty_io(&q->backing_dev_info)) + return; + + bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0) + if (wb_has_dirty_io(wb)) + wb_start_writeback(wb, nr_pages, true, + WB_REASON_LAPTOP_TIMER); } /* @@ -1718,10 +2012,12 @@ void laptop_sync_completion(void) void writeback_set_ratelimit(void) { + struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; + global_dirty_limits(&background_thresh, &dirty_thresh); - global_dirty_limit = dirty_thresh; + dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; @@ -1770,7 +2066,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions, GFP_KERNEL); + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); } /** @@ -2090,19 +2386,29 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. + * + * Caller must hold mem_cgroup_begin_page_stat(). + * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping) +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg) { + struct inode *inode = mapping->host; + trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb; + inode_attach_wb(inode, page); + wb = inode_to_wb(inode); + + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(bdi, BDI_RECLAIMABLE); - __inc_bdi_stat(bdi, BDI_DIRTIED); + __inc_wb_stat(wb, WB_RECLAIMABLE); + __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2113,21 +2419,18 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Thought you could. + * Caller must hold mem_cgroup_begin_page_stat(). */ -void account_page_cleaned(struct page *page, struct address_space *mapping) +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); } } -EXPORT_SYMBOL(account_page_cleaned); /* * For address_spaces which do not use buffers. Just tag the page as dirty in @@ -2143,26 +2446,34 @@ EXPORT_SYMBOL(account_page_cleaned); */ int __set_page_dirty_nobuffers(struct page *page) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; - if (!mapping) + if (!mapping) { + mem_cgroup_end_page_stat(memcg); return 1; + } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); + if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } + mem_cgroup_end_page_stat(memcg); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2177,10 +2488,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); void account_page_redirty(struct page *page) { struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + bool locked; + + wb = unlocked_inode_to_wb_begin(inode, &locked); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); + dec_wb_stat(wb, WB_DIRTIED); + unlocked_inode_to_wb_end(inode, locked); } } EXPORT_SYMBOL(account_page_redirty); @@ -2266,6 +2584,43 @@ int set_page_dirty_lock(struct page *page) EXPORT_SYMBOL(set_page_dirty_lock); /* + * This cancels just the dirty bit on the kernel page itself, it does NOT + * actually remove dirty bits on any mmap's that may be around. It also + * leaves the page tagged dirty, so any sync activity will still find it on + * the dirty lists, and in particular, clear_page_dirty_for_io() will still + * look at the dirty bits in the VM. + * + * Doing this should *normally* only ever be done when a page is truncated, + * and is not actually mapped anywhere at all. However, fs/buffer.c does + * this when it notices that somebody has cleaned out all the buffers on a + * page without actually doing it through the VM. Can you say "ext3 is + * horribly ugly"? Thought you could. + */ +void cancel_dirty_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + + memcg = mem_cgroup_begin_page_stat(page); + wb = unlocked_inode_to_wb_begin(inode, &locked); + + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping, memcg, wb); + + unlocked_inode_to_wb_end(inode, locked); + mem_cgroup_end_page_stat(memcg); + } else { + ClearPageDirty(page); + } +} +EXPORT_SYMBOL(cancel_dirty_page); + +/* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. * @@ -2282,10 +2637,16 @@ EXPORT_SYMBOL(set_page_dirty_lock); int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); + int ret = 0; BUG_ON(!PageLocked(page)); if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + /* * Yes, Virginia, this is indeed insane. * @@ -2321,13 +2682,17 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ + memcg = mem_cgroup_begin_page_stat(page); + wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - return 1; + dec_wb_stat(wb, WB_RECLAIMABLE); + ret = 1; } - return 0; + unlocked_inode_to_wb_end(inode, locked); + mem_cgroup_end_page_stat(memcg); + return ret; } return TestClearPageDirty(page); } @@ -2341,7 +2706,8 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2351,8 +2717,10 @@ int test_clear_page_writeback(struct page *page) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { - __dec_bdi_stat(bdi, BDI_WRITEBACK); - __bdi_writeout_inc(bdi); + struct bdi_writeback *wb = inode_to_wb(inode); + + __dec_wb_stat(wb, WB_WRITEBACK); + __wb_writeout_inc(wb); } } spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -2376,7 +2744,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2386,7 +2755,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_bdi_stat(bdi, BDI_WRITEBACK); + __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e6fa06f2784..506eac8b38af 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/interrupt.h> +#include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> @@ -61,6 +62,7 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> #include <linux/page_owner.h> +#include <linux/kthread.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -235,6 +237,77 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __meminit early_page_uninitialised(unsigned long pfn) +{ + int nid = early_pfn_to_nid(pfn); + + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. + */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-contrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -764,6 +837,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return 0; } +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + set_page_links(page, zone, nid, pfn); + init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, + int nid) +{ + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + break; + } + __init_single_pfn(pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. + */ +void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + SetPageReserved(page); + } + } +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); @@ -818,7 +960,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -838,6 +981,223 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +/* Only safe to use early in boot when initialisation is single-threaded */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + +#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; +} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __init deferred_free_range(struct page *page, + unsigned long pfn, int nr_pages) +{ + int i; + + if (!page) + return; + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == MAX_ORDER_NR_PAGES && + (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_boot_core(page, pfn, MAX_ORDER-1); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) + __free_pages_boot_core(page, pfn, 0); +} + +static __initdata DECLARE_RWSEM(pgdat_init_rwsem); + +/* Initialise remaining memory on a node */ +static int __init deferred_init_memmap(void *data) +{ + pg_data_t *pgdat = data; + int nid = pgdat->node_id; + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long start = jiffies; + unsigned long nr_pages = 0; + unsigned long walk_start, walk_end; + int i, zid; + struct zone *zone; + unsigned long first_init_pfn = pgdat->first_deferred_pfn; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (first_init_pfn == ULONG_MAX) { + up_read(&pgdat_init_rwsem); + return 0; + } + + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + /* Sanity check boundaries */ + BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); + BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); + pgdat->first_deferred_pfn = ULONG_MAX; + + /* Only the highest zone is deferred so find it */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + zone = pgdat->node_zones + zid; + if (first_init_pfn < zone_end_pfn(zone)) + break; + } + + for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { + unsigned long pfn, end_pfn; + struct page *page = NULL; + struct page *free_base_page = NULL; + unsigned long free_base_pfn = 0; + int nr_to_free = 0; + + end_pfn = min(walk_end, zone_end_pfn(zone)); + pfn = first_init_pfn; + if (pfn < walk_start) + pfn = walk_start; + if (pfn < zone->zone_start_pfn) + pfn = zone->zone_start_pfn; + + for (; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + goto free_range; + + /* + * Ensure pfn_valid is checked every + * MAX_ORDER_NR_PAGES for memory holes + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(pfn)) { + page = NULL; + goto free_range; + } + } + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + page = NULL; + goto free_range; + } + + /* Minimise pfn page lookups and scheduler checks */ + if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + page++; + } else { + nr_pages += nr_to_free; + deferred_free_range(free_base_page, + free_base_pfn, nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + + page = pfn_to_page(pfn); + cond_resched(); + } + + if (page->flags) { + VM_BUG_ON(page_zone(page) != zone); + goto free_range; + } + + __init_single_page(page, pfn, zid, nid); + if (!free_base_page) { + free_base_page = page; + free_base_pfn = pfn; + nr_to_free = 0; + } + nr_to_free++; + + /* Where possible, batch up pages for a single free */ + continue; +free_range: + /* Free the current block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, + nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + } + + first_init_pfn = max(end_pfn, first_init_pfn); + } + + /* Sanity check that the next zone really is unpopulated */ + WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + + pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, + jiffies_to_msecs(jiffies - start)); + up_read(&pgdat_init_rwsem); + return 0; +} + +void __init page_alloc_init_late(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + down_read(&pgdat_init_rwsem); + kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); + } + + /* Block until all are initialised */ + down_write(&pgdat_init_rwsem); + up_write(&pgdat_init_rwsem); +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) @@ -4150,6 +4510,9 @@ static void setup_zone_migrate_reserve(struct zone *zone) zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) + return; + if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); @@ -4212,15 +4575,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - struct page *page; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4232,14 +4596,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } - page = pfn_to_page(pfn); - set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); - init_page_count(page); - page_mapcount_reset(page); - page_cpupid_reset_last(page); - SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -4254,17 +4615,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. */ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (!(pfn & (pageblock_nr_pages - 1))) { + struct page *page = pfn_to_page(pfn); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(pfn << PAGE_SHIFT)); -#endif + __init_single_page(page, pfn, zone, nid); + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } else { + __init_single_pfn(pfn, zone, nid); + } } } @@ -4522,57 +4880,30 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. - */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0 && nid != node) - return false; - return true; -} -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -5090,6 +5421,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP diff --git a/mm/page_io.c b/mm/page_io.c index 6424869e275e..520baa4b04d7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -69,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -void end_swap_bio_read(struct bio *bio, int err) +static void end_swap_bio_read(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; diff --git a/mm/page_owner.c b/mm/page_owner.c index 0993f5f36b01..bd5f842b56d2 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -310,4 +310,4 @@ static int __init pageowner_init(void) return 0; } -module_init(pageowner_init) +late_initcall(pageowner_init) diff --git a/mm/readahead.c b/mm/readahead.c index 935675844b2e..60cd846a9a44 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. */ - if (bdi_read_congested(inode_to_bdi(mapping->host))) + if (inode_read_congested(mapping->host)) return; /* do read-ahead */ diff --git a/mm/rmap.c b/mm/rmap.c index 7af1ecb21ccb..171b68768df1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -30,6 +30,8 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) + * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * mapping->tree_lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) diff --git a/mm/slab_common.c b/mm/slab_common.c index 9f8d71f78404..3e5f8f29c286 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -855,6 +855,12 @@ void __init setup_kmalloc_cache_index_table(void) } } +static void __init new_kmalloc_cache(int idx, unsigned long flags) +{ + kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + kmalloc_info[idx].size, flags); +} + /* * Create the kmalloc array. Some of the regular kmalloc arrays * may already have been created because they were needed to @@ -864,25 +870,19 @@ void __init create_kmalloc_caches(unsigned long flags) { int i; - for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache( - kmalloc_info[i].name, - kmalloc_info[i].size, - flags); - } + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[i]) + new_kmalloc_cache(i, flags); /* - * "i == 2" is the "kmalloc-192" case which is the last special - * case for initialization and it's the point to jump to - * allocate the minimize size of the object. In slab allocator, - * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4 - * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is - * defined, it may be larger than 2^5 and here is also the - * trick to skip the empty gap. + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches */ - if (i == 2) - i = (KMALLOC_SHIFT_LOW - 1); + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + new_kmalloc_cache(1, flags); + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + new_kmalloc_cache(2, flags); } /* Kmalloc array is now usable */ diff --git a/mm/truncate.c b/mm/truncate.c index 66af9031fae8..76e35ad97102 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -116,9 +116,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping); - + cancel_dirty_page(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -512,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg; + unsigned long flags; + if (page->mapping != mapping) return 0; if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -532,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) page_cache_release(page); /* pagecache ref */ return 1; failed: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 19ef01e90ac4..e61445dce04e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } + +/** + * sane_reclaim - is the usual dirty throttling mechanism operational? + * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_page_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. + */ +static bool sane_reclaim(struct scan_control *sc) +{ + struct mem_cgroup *memcg = sc->target_mem_cgroup; + + if (!memcg) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup)) + return true; +#endif + return false; +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool sane_reclaim(struct scan_control *sc) +{ + return true; +} #endif static unsigned long zone_reclaimable_pages(struct zone *zone) @@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi, - struct scan_control *sc) +static int may_write_to_inode(struct inode *inode, struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(inode)) return 1; - if (bdi == current->backing_dev_info) + if (inode_to_bdi(inode) == current->backing_dev_info) return 1; return 0; } @@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) + if (!may_write_to_inode(mapping->host, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { + unsigned long flags; + struct mem_cgroup *memcg; + BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. * @@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, shadow, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } @@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(inode_to_bdi(mapping->host))) || + inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) nr_congested++; @@ -935,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, * note that the LRU is being scanned too quickly and the * caller can stall after page list has been processed. * - * 2) Global reclaim encounters a page, memcg encounters a - * page that is not marked for immediate reclaim or - * the caller does not have __GFP_IO. In this case mark - * the page for immediate reclaim and continue scanning. + * 2) Global or new memcg reclaim encounters a page that is + * not marked for immediate reclaim or the caller does not + * have __GFP_IO. In this case mark the page for immediate + * reclaim and continue scanning. * * __GFP_IO is checked because a loop driver thread might * enter reclaim, and deadlock if it waits on a page for @@ -952,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing * may_enter_fs here is liable to OOM on them. * - * 3) memcg encounters a page that is not already marked + * 3) Legacy memcg encounters a page that is not already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to @@ -967,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Case 2 above */ - } else if (global_reclaim(sc) || + } else if (sane_reclaim(sc) || !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { /* * This is slightly racy - end_page_writeback() @@ -1416,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!global_reclaim(sc)) + if (!sane_reclaim(sc)) return 0; if (file) { @@ -1608,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, set_bit(ZONE_WRITEBACK, &zone->flags); /* - * memcg will stall in page writeback so only consider forcibly - * stalling for global reclaim + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling here. */ - if (global_reclaim(sc)) { + if (sane_reclaim(sc)) { /* * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. diff --git a/mm/zbud.c b/mm/zbud.c index 2ee4e4520493..f3bf6f7627d8 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -97,6 +97,10 @@ struct zbud_pool { struct list_head lru; u64 pages_nr; struct zbud_ops *ops; +#ifdef CONFIG_ZPOOL + struct zpool *zpool; + struct zpool_ops *zpool_ops; +#endif }; /* @@ -123,7 +127,10 @@ struct zbud_header { static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) { - return zpool_evict(pool, handle); + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; } static struct zbud_ops zbud_zpool_ops = { @@ -131,9 +138,17 @@ static struct zbud_ops zbud_zpool_ops = { }; static void *zbud_zpool_create(char *name, gfp_t gfp, - struct zpool_ops *zpool_ops) + struct zpool_ops *zpool_ops, + struct zpool *zpool) { - return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; } static void zbud_zpool_destroy(void *pool) @@ -292,7 +307,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) struct zbud_pool *pool; int i; - pool = kmalloc(sizeof(struct zbud_pool), gfp); + pool = kzalloc(sizeof(struct zbud_pool), gfp); if (!pool) return NULL; spin_lock_init(&pool->lock); diff --git a/mm/zpool.c b/mm/zpool.c index bacdab6e47de..722a4f60e90b 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -73,33 +73,6 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); -/** - * zpool_evict() - evict callback from a zpool implementation. - * @pool: pool to evict from. - * @handle: handle to evict. - * - * This can be used by zpool implementations to call the - * user's evict zpool_ops struct evict callback. - */ -int zpool_evict(void *pool, unsigned long handle) -{ - struct zpool *zpool; - - spin_lock(&pools_lock); - list_for_each_entry(zpool, &pools_head, list) { - if (zpool->pool == pool) { - spin_unlock(&pools_lock); - if (!zpool->ops || !zpool->ops->evict) - return -EINVAL; - return zpool->ops->evict(zpool, handle); - } - } - spin_unlock(&pools_lock); - - return -ENOENT; -} -EXPORT_SYMBOL(zpool_evict); - static struct zpool_driver *zpool_get_driver(char *type) { struct zpool_driver *driver; @@ -147,7 +120,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, struct zpool_driver *driver; struct zpool *zpool; - pr_info("creating pool type %s\n", type); + pr_debug("creating pool type %s\n", type); driver = zpool_get_driver(type); @@ -170,7 +143,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(name, gfp, ops); + zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; if (!zpool->pool) { @@ -180,7 +153,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - pr_info("created %s pool\n", type); + pr_debug("created pool type %s\n", type); spin_lock(&pools_lock); list_add(&zpool->list, &pools_head); @@ -202,7 +175,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_info("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->type); spin_lock(&pools_lock); list_del(&zpool->list); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a8b5e749e84e..0a7f81aa2249 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -45,10 +45,6 @@ * */ -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -313,7 +309,8 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops, + struct zpool *zpool) { return zs_create_pool(name, gfp); } diff --git a/mm/zswap.c b/mm/zswap.c index 4249e82ff934..2d5727baed59 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -75,9 +75,10 @@ static u64 zswap_duplicate_entry; /********************************* * tunables **********************************/ -/* Enable/disable zswap (disabled by default, fixed at boot for now) */ -static bool zswap_enabled __read_mostly; -module_param_named(enabled, zswap_enabled, bool, 0444); + +/* Enable/disable zswap (disabled by default) */ +static bool zswap_enabled; +module_param_named(enabled, zswap_enabled, bool, 0644); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" @@ -648,7 +649,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, u8 *src, *dst; struct zswap_header *zhdr; - if (!tree) { + if (!zswap_enabled || !tree) { ret = -ENODEV; goto reject; } @@ -901,9 +902,6 @@ static int __init init_zswap(void) { gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; - if (!zswap_enabled) - return 0; - pr_info("loading zswap\n"); zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |