summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c1096
1 files changed, 587 insertions, 509 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..d18d3a6e7337 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
#else
#define do_swap_account 0
#endif
-
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
"swap",
};
-enum mem_cgroup_events_index {
- MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
- MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
- MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
- MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
- MEM_CGROUP_EVENTS_NSTATS,
-};
-
static const char * const mem_cgroup_events_names[] = {
"pgpgin",
"pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long events[MEMCG_NR_EVENTS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
@@ -284,6 +267,10 @@ struct mem_cgroup {
struct page_counter memsw;
struct page_counter kmem;
+ /* Normal memory consumption range */
+ unsigned long low;
+ unsigned long high;
+
unsigned long soft_limit;
/* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
- atomic_t moving_account;
+ atomic_t moving_account;
/* taken only while moving_account > 0 */
- spinlock_t move_lock;
+ spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
/*
* percpu counter.
*/
@@ -343,11 +332,10 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* analogous to slab_common's slab_caches list, but per-memcg;
- * protected by memcg_slab_mutex */
- struct list_head memcg_slab_caches;
- /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
+ bool kmem_acct_activated;
+ bool kmem_acct_active;
#endif
int last_scanned_node;
@@ -366,29 +354,26 @@ struct mem_cgroup {
};
#ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
- return memcg->kmemcg_id >= 0;
+ return memcg->kmem_acct_active;
}
#endif
/* Stuffs for move charges at task migration. */
/*
- * Types of charges to be moved. "move_charge_at_immitgrate" and
- * "immigrate_flags" are treated as a left-shifted bitmap of these types.
+ * Types of charges to be moved.
*/
-enum move_type {
- MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
- MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
- NR_MOVE_TYPE,
-};
+#define MOVE_ANON 0x1U
+#define MOVE_FILE 0x2U
+#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
- unsigned long immigrate_flags;
+ unsigned long flags;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
@@ -399,16 +384,6 @@ static struct move_charge_struct {
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
-static bool move_anon(void)
-{
- return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
-}
-
-static bool move_file(void)
-{
- return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
-}
-
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(tcp_proto_cgroup);
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
- if (!memcg_proto_activated(&memcg->tcp_mem))
- return;
- static_key_slow_dec(&memcg_socket_limit_enabled);
-}
-#else
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
-}
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
*
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size. It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
*/
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+ down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+ up_read(&memcg_cache_ids_sem);
+}
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size;
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-static void memcg_free_cache_id(int id);
-
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
- if (memcg_kmem_is_active(memcg)) {
- static_key_slow_dec(&memcg_kmem_enabled_key);
- memcg_free_cache_id(memcg->kmemcg_id);
- }
- /*
- * This check can't live in kmem destruction function,
- * since the charges will outlive the cgroup
- */
- WARN_ON(page_counter_read(&memcg->kmem));
-}
-#else
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
-static void disarm_static_keys(struct mem_cgroup *memcg)
-{
- disarm_sock_keys(memcg);
- disarm_kmem_keys(memcg);
-}
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
+bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return true;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1477,9 +1444,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_info("Task in ");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_info(" killed as a result of limit of ");
+ pr_cont(" killed as a result of limit of ");
pr_cont_cgroup_path(memcg->css.cgroup);
- pr_info("\n");
+ pr_cont("\n");
rcu_read_unlock();
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- set_thread_flag(TIF_MEMDIE);
+ mark_tsk_oom_victim(current);
return;
}
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (!memcg)
return false;
- if (!handle)
+ if (!handle || oom_killer_disabled)
goto cleanup;
owait.memcg = memcg;
@@ -1980,34 +1947,33 @@ cleanup:
/**
* mem_cgroup_begin_page_stat - begin a page state statistics transaction
* @page: page that is going to change accounted state
- * @locked: &memcg->move_lock slowpath was taken
- * @flags: IRQ-state flags for &memcg->move_lock
*
* This function must mark the beginning of an accounted page state
* change to prevent double accounting when the page is concurrently
* being moved to another memcg:
*
- * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ * memcg = mem_cgroup_begin_page_stat(page);
* if (TestClearPageState(page))
* mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg, locked, flags);
- *
- * The RCU lock is held throughout the transaction. The fast path can
- * get away without acquiring the memcg->move_lock (@locked is false)
- * because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when the page
- * state that is going to change is the only thing preventing the page
- * from being uncharged. E.g. end-writeback clearing PageWriteback(),
- * which allows migration to go ahead and uncharge the page before the
- * account transaction might be complete.
+ * mem_cgroup_end_page_stat(memcg);
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
- bool *locked,
- unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
{
struct mem_cgroup *memcg;
+ unsigned long flags;
+ /*
+ * The RCU lock is held throughout the transaction. The fast
+ * path can get away without acquiring the memcg->move_lock
+ * because page moving starts with an RCU grace period.
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page from being uncharged.
+ * E.g. end-writeback clearing PageWriteback(), which allows
+ * migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
@@ -2017,16 +1983,22 @@ again:
if (unlikely(!memcg))
return NULL;
- *locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
- spin_lock_irqsave(&memcg->move_lock, *flags);
+ spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
- *locked = true;
+
+ /*
+ * When charge migration first begins, we can have locked and
+ * unlocked page stat updates happening concurrently. Track
+ * the task who has the lock for mem_cgroup_end_page_stat().
+ */
+ memcg->move_lock_task = current;
+ memcg->move_lock_flags = flags;
return memcg;
}
@@ -2034,14 +2006,17 @@ again:
/**
* mem_cgroup_end_page_stat - finish a page state statistics transaction
* @memcg: the memcg that was accounted against
- * @locked: value received from mem_cgroup_begin_page_stat()
- * @flags: value received from mem_cgroup_begin_page_stat()
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
- unsigned long *flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
{
- if (memcg && *locked)
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
+ if (memcg && memcg->move_lock_task == current) {
+ unsigned long flags = memcg->move_lock_flags;
+
+ memcg->move_lock_task = NULL;
+ memcg->move_lock_flags = 0;
+
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
+ }
rcu_read_unlock();
}
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy)
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
-static void __init memcg_stock_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct memcg_stock_pcp *stock =
- &per_cpu(memcg_stock, cpu);
- INIT_WORK(&stock->work, drain_local_stock);
- }
-}
-
/*
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
@@ -2294,6 +2258,8 @@ retry:
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
+ mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
@@ -2335,6 +2301,8 @@ retry:
if (fatal_signal_pending(current))
goto bypass;
+ mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+
mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
@@ -2346,6 +2314,16 @@ done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ /*
+ * If the hierarchy is above the normal consumption range,
+ * make the charging task trim their excess contribution.
+ */
+ do {
+ if (page_counter_read(&memcg->memory) <= memcg->high)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
done:
return ret;
}
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifdef CONFIG_MEMCG_KMEM
-/*
- * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
- * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
- */
-static DEFINE_MUTEX(memcg_slab_mutex);
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
- struct kmem_cache *cachep;
-
- VM_BUG_ON(p->is_root_cache);
- cachep = p->root_cache;
- return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
-}
-
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned long nr_pages)
{
struct page_counter *counter;
int ret = 0;
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
return ret;
}
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
{
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_swap_account)
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
int id, size;
int err;
- id = ida_simple_get(&kmem_limited_groups,
+ id = ida_simple_get(&memcg_cache_ida,
0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
if (id < 0)
return id;
- if (id < memcg_limited_groups_array_size)
+ if (id < memcg_nr_cache_ids)
return id;
/*
* There's no space for the new id in memcg_caches arrays,
* so we have to grow them.
*/
+ down_write(&memcg_cache_ids_sem);
size = 2 * (id + 1);
if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- mutex_lock(&memcg_slab_mutex);
err = memcg_update_all_caches(size);
- mutex_unlock(&memcg_slab_mutex);
+ if (!err)
+ err = memcg_update_all_list_lrus(size);
+ if (!err)
+ memcg_nr_cache_ids = size;
+
+ up_write(&memcg_cache_ids_sem);
if (err) {
- ida_simple_remove(&kmem_limited_groups, id);
+ ida_simple_remove(&memcg_cache_ida, id);
return err;
}
return id;
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void)
static void memcg_free_cache_id(int id)
{
- ida_simple_remove(&kmem_limited_groups, id);
-}
-
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
- memcg_limited_groups_array_size = num;
+ ida_simple_remove(&memcg_cache_ida, id);
}
-static void memcg_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by
- memcg_slab_mutex */
- struct kmem_cache *cachep;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- id = memcg_cache_id(memcg);
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (cache_from_memcg_idx(root_cache, id))
- return;
-
- cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
- cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
- /*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
- */
- if (!cachep)
- return;
-
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-
- /*
- * Since readers won't lock (see cache_from_memcg_idx()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id]);
- root_cache->memcg_params->memcg_caches[id] = cachep;
-}
-
-static void memcg_unregister_cache(struct kmem_cache *cachep)
-{
- struct kmem_cache *root_cache;
- struct mem_cgroup *memcg;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- BUG_ON(is_root_cache(cachep));
-
- root_cache = cachep->memcg_params->root_cache;
- memcg = cachep->memcg_params->memcg;
- id = memcg_cache_id(memcg);
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
- root_cache->memcg_params->memcg_caches[id] = NULL;
-
- list_del(&cachep->memcg_params->list);
-
- kmem_cache_destroy(cachep);
-}
-
-int __memcg_cleanup_cache_params(struct kmem_cache *s)
-{
- struct kmem_cache *c;
- int i, failed = 0;
-
- mutex_lock(&memcg_slab_mutex);
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
- memcg_unregister_cache(c);
-
- if (cache_from_memcg_idx(s, i))
- failed++;
- }
- mutex_unlock(&memcg_slab_mutex);
- return failed;
-}
-
-static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
- struct kmem_cache *cachep;
- struct memcg_cache_params *params, *tmp;
-
- if (!memcg_kmem_is_active(memcg))
- return;
-
- mutex_lock(&memcg_slab_mutex);
- list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
- cachep = memcg_params_to_cache(params);
- memcg_unregister_cache(cachep);
- }
- mutex_unlock(&memcg_slab_mutex);
-}
-
-struct memcg_register_cache_work {
+struct memcg_kmem_cache_create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
-static void memcg_register_cache_func(struct work_struct *w)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
{
- struct memcg_register_cache_work *cw =
- container_of(w, struct memcg_register_cache_work, work);
+ struct memcg_kmem_cache_create_work *cw =
+ container_of(w, struct memcg_kmem_cache_create_work, work);
struct mem_cgroup *memcg = cw->memcg;
struct kmem_cache *cachep = cw->cachep;
- mutex_lock(&memcg_slab_mutex);
- memcg_register_cache(memcg, cachep);
- mutex_unlock(&memcg_slab_mutex);
+ memcg_create_kmem_cache(memcg, cachep);
css_put(&memcg->css);
kfree(cw);
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
- struct memcg_register_cache_work *cw;
+ struct memcg_kmem_cache_create_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (!cw)
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
cw->memcg = memcg;
cw->cachep = cachep;
+ INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- INIT_WORK(&cw->work, memcg_register_cache_func);
schedule_work(&cw->work);
}
-static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_register_cache will recurse.
+ * in __memcg_schedule_kmem_cache_create will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
* the safest choice is to do it like this, wrapping the whole function.
*/
current->memcg_kmem_skip_account = 1;
- __memcg_schedule_register_cache(memcg, cachep);
+ __memcg_schedule_kmem_cache_create(memcg, cachep);
current->memcg_kmem_skip_account = 0;
}
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
- unsigned int nr_pages = 1 << order;
-
- return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
- unsigned int nr_pages = 1 << order;
-
- memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-}
-
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ int kmemcg_id;
- VM_BUG_ON(!cachep->memcg_params);
- VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(cachep));
if (current->memcg_kmem_skip_account)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- if (!memcg_kmem_is_active(memcg))
+ kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ if (kmemcg_id < 0)
goto out;
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
if (likely(memcg_cachep))
return memcg_cachep;
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
* could happen with the slab_mutex held. So it's better to
* defer everything.
*/
- memcg_schedule_register_cache(memcg, cachep);
+ memcg_schedule_kmem_cache_create(memcg, cachep);
out:
css_put(&memcg->css);
return cachep;
@@ -2834,7 +2671,7 @@ out:
void __memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params->memcg->css);
+ css_put(&cachep->memcg_params.memcg->css);
}
/*
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL;
}
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+ if (PageSlab(page)) {
+ cachep = page->slab_cache;
+ if (!is_root_cache(cachep))
+ memcg = cachep->memcg_params.memcg;
+ } else
+ /* page allocated by alloc_kmem_pages */
+ memcg = page->mem_cgroup;
+
+ return memcg;
+}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3043,18 +2898,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
- /*
- * This function is only called from task migration context now.
- * It postpones page_counter and refcount handling till the end
- * of task migration(mem_cgroup_clear_mc()) for performance
- * improvement. But we cannot postpone css_get(to) because if
- * the process that has been moved to @to does swap-in, the
- * refcount of @to might be decreased to 0.
- *
- * We are in attach() phase, so the cgroup is guaranteed to be
- * alive, so we can just call css_get().
- */
- css_get(&to->css);
return 0;
}
return -EINVAL;
@@ -3445,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
int err = 0;
int memcg_id;
- if (memcg_kmem_is_active(memcg))
- return 0;
+ BUG_ON(memcg->kmemcg_id >= 0);
+ BUG_ON(memcg->kmem_acct_activated);
+ BUG_ON(memcg->kmem_acct_active);
/*
* For simplicity, we won't allow this to be disabled. It also can't
@@ -3489,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
* patched.
*/
memcg->kmemcg_id = memcg_id;
+ memcg->kmem_acct_activated = true;
+ memcg->kmem_acct_active = true;
out:
return err;
}
@@ -3545,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
int ret;
buf = strstrip(buf);
- ret = page_counter_memparse(buf, &nr_pages);
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
if (ret)
return ret;
@@ -3621,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val >= (1 << NR_MOVE_TYPE))
+ if (val & ~MOVE_MASK)
return -EINVAL;
/*
@@ -3699,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
struct mem_cgroup *mi;
unsigned int i;
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
+ MEM_CGROUP_STAT_NSTATS);
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
+ MEM_CGROUP_EVENTS_NSTATS);
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3913,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
unsigned long usage;
int i, size, ret;
- ret = page_counter_memparse(args, &threshold);
+ ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
@@ -4164,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return mem_cgroup_sockets_init(memcg, ss);
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (!memcg->kmem_acct_active)
+ return;
+
+ /*
+ * Clear the 'active' flag before clearing memcg_caches arrays entries.
+ * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+ * guarantees no cache will be created for this cgroup after we are
+ * done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_acct_active = false;
+
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
+ */
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
- memcg_unregister_all_caches(memcg);
+ if (memcg->kmem_acct_activated) {
+ memcg_destroy_kmem_caches(memcg);
+ static_key_slow_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4175,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return 0;
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
@@ -4403,7 +4307,7 @@ out_kfree:
return ret;
}
-static struct cftype mem_cgroup_files[] = {
+static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4514,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = {
{ }, /* terminate */
};
-#ifdef CONFIG_MEMCG_SWAP
-static struct cftype memsw_cgroup_files[] = {
- {
- .name = "memsw.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.failcnt",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- { }, /* terminate */
-};
-#endif
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -4621,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
-
- disarm_static_keys(memcg);
kfree(memcg);
}
@@ -4637,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
- struct mem_cgroup_tree_per_node *rtpn;
- struct mem_cgroup_tree_per_zone *rtpz;
- int tmp, node, zone;
-
- for_each_node(node) {
- tmp = node;
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
- BUG_ON(!rtpn);
-
- soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- rtpz = &rtpn->rb_tree_per_zone[zone];
- rtpz->rb_root = RB_ROOT;
- spin_lock_init(&rtpz->lock);
- }
- }
-}
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -4679,6 +4530,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_css == NULL) {
root_mem_cgroup = memcg;
page_counter_init(&memcg->memory, NULL);
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
}
@@ -4693,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&memcg->event_list_lock);
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
#endif
return &memcg->css;
@@ -4724,6 +4576,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent->use_hierarchy) {
page_counter_init(&memcg->memory, &parent->memory);
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4733,6 +4587,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
} else {
page_counter_init(&memcg->memory, NULL);
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
/*
@@ -4777,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
spin_unlock(&memcg->event_list_lock);
vmpressure_cleanup(&memcg->vmpressure);
+
+ memcg_deactivate_kmem(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4807,7 +4665,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
- memcg->soft_limit = 0;
+ memcg->low = 0;
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
}
#ifdef CONFIG_MMU
@@ -4883,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
if (!page || !page_mapped(page))
return NULL;
if (PageAnon(page)) {
- /* we don't move shared anon */
- if (!move_anon())
+ if (!(mc.flags & MOVE_ANON))
return NULL;
- } else if (!move_file())
- /* we ignore mapcount for file pages */
- return NULL;
+ } else {
+ if (!(mc.flags & MOVE_FILE))
+ return NULL;
+ }
if (!get_page_unless_zero(page))
return NULL;
@@ -4902,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!move_anon() || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
/*
* Because lookup_swap_cache() updates some statistics counter,
@@ -4931,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
if (!vma->vm_file) /* anonymous vma */
return NULL;
- if (!move_file())
+ if (!(mc.flags & MOVE_FILE))
return NULL;
mapping = vma->vm_file->f_mapping;
- if (pte_none(ptent))
- pgoff = linear_page_index(vma, addr);
- else /* pte_file(ptent) is true */
- pgoff = pte_to_pgoff(ptent);
+ pgoff = linear_page_index(vma, addr);
/* page is moved even if it's not RSS of this task(page-faulted). */
#ifdef CONFIG_SWAP
@@ -4970,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, addr, ptent, &ent);
- else if (pte_none(ptent) || pte_file(ptent))
+ else if (pte_none(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
@@ -5013,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
- if (!move_anon())
+ if (!(mc.flags & MOVE_ANON))
return ret;
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
@@ -5036,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
@@ -5062,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_count_precharge_walk = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_count_precharge_walk);
- }
+ walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -5155,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct task_struct *p = cgroup_taskset_first(tset);
int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- unsigned long move_charge_at_immigrate;
+ unsigned long move_flags;
/*
* We are now commited to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_charge_at_immigrate = memcg->move_charge_at_immigrate;
- if (move_charge_at_immigrate) {
+ move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5183,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
spin_lock(&mc.lock);
mc.from = from;
mc.to = memcg;
- mc.immigrate_flags = move_charge_at_immigrate;
+ mc.flags = move_flags;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
@@ -5208,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct mm_walk *walk)
{
int ret = 0;
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
@@ -5304,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
- struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_move_charge_walk = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ .mm = mm,
+ };
lru_add_drain_all();
/*
@@ -5327,24 +5180,11 @@ retry:
cond_resched();
goto retry;
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- int ret;
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- ret = walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_move_charge_walk);
- if (ret)
- /*
- * means we have consumed all precharges and failed in
- * doing additional charge. Just abandon here.
- */
- break;
- }
+ /*
+ * When we have consumed all precharges and failed in doing
+ * additional charge, the page walk just aborts.
+ */
+ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
up_read(&mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
}
@@ -5395,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
-struct cgroup_subsys memory_cgrp_subsys = {
- .css_alloc = mem_cgroup_css_alloc,
- .css_online = mem_cgroup_css_online,
- .css_offline = mem_cgroup_css_offline,
- .css_free = mem_cgroup_css_free,
- .css_reset = mem_cgroup_css_reset,
- .can_attach = mem_cgroup_can_attach,
- .cancel_attach = mem_cgroup_cancel_attach,
- .attach = mem_cgroup_move_task,
- .bind = mem_cgroup_bind,
- .legacy_cftypes = mem_cgroup_files,
- .early_init = 0,
-};
+static u64 memory_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+}
-#ifdef CONFIG_MEMCG_SWAP
-static int __init enable_swap_account(char *s)
+static int memory_low_show(struct seq_file *m, void *v)
{
- if (!strcmp(s, "1"))
- really_do_swap_account = 1;
- else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
- return 1;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long low = ACCESS_ONCE(memcg->low);
+
+ if (low == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
+
+ return 0;
}
-__setup("swapaccount=", enable_swap_account);
-static void __init memsw_file_init(void)
+static ssize_t memory_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long low;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &low);
+ if (err)
+ return err;
+
+ memcg->low = low;
+
+ return nbytes;
}
-static void __init enable_swap_cgroup(void)
+static int memory_high_show(struct seq_file *m, void *v)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- memsw_file_init();
- }
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long high = ACCESS_ONCE(memcg->high);
+
+ if (high == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
+
+ return 0;
}
-#else
-static void __init enable_swap_cgroup(void)
+static ssize_t memory_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &high);
+ if (err)
+ return err;
+
+ memcg->high = high;
+
+ return nbytes;
}
-#endif
-#ifdef CONFIG_MEMCG_SWAP
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @page to @entry.
- */
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg;
- unsigned short oldid;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = ACCESS_ONCE(memcg->memory.limit);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
- if (!do_swap_account)
- return;
+ return 0;
+}
- memcg = page->mem_cgroup;
+static ssize_t memory_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
- /* Readahead page, never charged */
- if (!memcg)
- return;
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &max);
+ if (err)
+ return err;
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
- VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, true);
+ err = mem_cgroup_resize_limit(memcg, max);
+ if (err)
+ return err;
- page->mem_cgroup = NULL;
+ return nbytes;
+}
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, 1);
+static int memory_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- /* XXX: caller holds IRQ-safe mapping->tree_lock */
- VM_BUG_ON(!irqs_disabled());
+ seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
+ seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
+ seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
+ seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
- mem_cgroup_charge_statistics(memcg, page, -1);
- memcg_check_events(memcg, page);
+ return 0;
}
+static struct cftype memory_files[] = {
+ {
+ .name = "current",
+ .read_u64 = memory_current_read,
+ },
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_low_show,
+ .write = memory_low_write,
+ },
+ {
+ .name = "high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_high_show,
+ .write = memory_high_write,
+ },
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_max_show,
+ .write = memory_max_write,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_events_show,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys memory_cgrp_subsys = {
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_online = mem_cgroup_css_online,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
+ .css_reset = mem_cgroup_css_reset,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
+ .attach = mem_cgroup_move_task,
+ .bind = mem_cgroup_bind,
+ .dfl_cftypes = memory_files,
+ .legacy_cftypes = mem_cgroup_legacy_files,
+ .early_init = 0,
+};
+
/**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
- * @entry: swap entry to uncharge
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+void mem_cgroup_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx,
+ unsigned int nr)
+{
+ this_cpu_add(memcg->stat->events[idx], nr);
+}
+
+/**
+ * mem_cgroup_low - check if memory consumption is below the normal range
+ * @root: the highest ancestor to consider
+ * @memcg: the memory cgroup to check
*
- * Drop the memsw charge associated with @entry.
+ * Returns %true if memory consumption of @memcg, and that of all
+ * configurable ancestors up to @root, is below the normal range.
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
- unsigned short id;
+ if (mem_cgroup_disabled())
+ return false;
- if (!do_swap_account)
- return;
+ /*
+ * The toplevel group doesn't have a configurable range, so
+ * it's never low when looked at directly, and it is not
+ * considered an ancestor when assessing the hierarchy.
+ */
- id = swap_cgroup_record(entry, 0);
- rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
- if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
- mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
+ if (memcg == root_mem_cgroup)
+ return false;
+
+ if (page_counter_read(&memcg->memory) > memcg->low)
+ return false;
+
+ while (memcg != root) {
+ memcg = parent_mem_cgroup(memcg);
+
+ if (memcg == root_mem_cgroup)
+ break;
+
+ if (page_counter_read(&memcg->memory) > memcg->low)
+ return false;
}
- rcu_read_unlock();
+ return true;
}
-#endif
/**
* mem_cgroup_try_charge - try charging a page
@@ -5782,7 +5715,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
* mem_cgroup_migrate - migrate a charge to another page
* @oldpage: currently charged page
* @newpage: page to transfer the charge to
- * @lrucare: both pages might be on the LRU already
+ * @lrucare: either or both pages might be on the LRU already
*
* Migrate the charge from @oldpage to @newpage.
*
@@ -5840,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
*/
static int __init mem_cgroup_init(void)
{
+ int cpu, node;
+
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
- enable_swap_cgroup();
- mem_cgroup_soft_limit_tree_init();
- memcg_stock_init();
+
+ for_each_possible_cpu(cpu)
+ INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
+ drain_local_stock);
+
+ for_each_node(node) {
+ struct mem_cgroup_tree_per_node *rtpn;
+ int zone;
+
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct mem_cgroup_tree_per_zone *rtpz;
+
+ rtpz = &rtpn->rb_tree_per_zone[zone];
+ rtpz->rb_root = RB_ROOT;
+ spin_lock_init(&rtpz->lock);
+ }
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+ }
+
return 0;
}
subsys_initcall(mem_cgroup_init);
+
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short oldid;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ if (!do_swap_account)
+ return;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ page->mem_cgroup = NULL;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, 1);
+
+ /* XXX: caller holds IRQ-safe mapping->tree_lock */
+ VM_BUG_ON(!irqs_disabled());
+
+ mem_cgroup_charge_statistics(memcg, page, -1);
+ memcg_check_events(memcg, page);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ if (!do_swap_account)
+ return;
+
+ id = swap_cgroup_record(entry, 0);
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg) {
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memsw, 1);
+ mem_cgroup_swap_statistics(memcg, false);
+ css_put(&memcg->css);
+ }
+ rcu_read_unlock();
+}
+
+/* for remember boot option*/
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+static int __init enable_swap_account(char *s)
+{
+ if (!strcmp(s, "1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+static struct cftype memsw_cgroup_files[] = {
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ { }, /* terminate */
+};
+
+static int __init mem_cgroup_swap_init(void)
+{
+ if (!mem_cgroup_disabled() && really_do_swap_account) {
+ do_swap_account = 1;
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+ memsw_cgroup_files));
+ }
+ return 0;
+}
+subsys_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */