summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c478
1 files changed, 92 insertions, 386 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acb93c554f6e..6ddaeba34e09 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
"unevictable",
};
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_TARGET_NUMAINFO,
- MEM_CGROUP_NTARGETS,
-};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
-struct mem_cgroup_stat_cpu {
- long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEMCG_NR_EVENTS];
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
- struct mem_cgroup *position;
- /* scan generation, increased every round-trip */
- unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
- struct lruvec lruvec;
- unsigned long lru_size[NR_LRU_LISTS];
-
- struct reclaim_iter iter[DEF_PRIORITY + 1];
-
- struct rb_node tree_node; /* RB tree node */
- unsigned long usage_in_excess;/* Set to the value by which */
- /* the soft limit is exceeded*/
- bool on_tree;
- struct mem_cgroup *memcg; /* Back pointer, we cannot */
- /* use container_of */
-};
-
-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-struct mem_cgroup_threshold {
- struct eventfd_ctx *eventfd;
- unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below or equal to usage. */
- int current_threshold;
- /* Size of entries[] */
- unsigned int size;
- /* Array of thresholds */
- struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
- /* Primary thresholds array */
- struct mem_cgroup_threshold_ary *primary;
- /*
- * Spare threshold array.
- * This is needed to make mem_cgroup_unregister_event() "never fail".
- * It must be able to store at least primary->size - 1 entries.
- */
- struct mem_cgroup_threshold_ary *spare;
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
- struct cgroup_subsys_state css;
-
- /* Accounted resources */
- struct page_counter memory;
- struct page_counter memsw;
- struct page_counter kmem;
-
- /* Normal memory consumption range */
- unsigned long low;
- unsigned long high;
-
- unsigned long soft_limit;
-
- /* vmpressure notifications */
- struct vmpressure vmpressure;
-
- /* css_online() has been completed */
- int initialized;
-
- /*
- * Should the accounting and control be hierarchical, per subtree?
- */
- bool use_hierarchy;
-
- /* protected by memcg_oom_lock */
- bool oom_lock;
- int under_oom;
-
- int swappiness;
- /* OOM-Killer disable */
- int oom_kill_disable;
-
- /* protect arrays of thresholds */
- struct mutex thresholds_lock;
-
- /* thresholds for memory usage. RCU-protected */
- struct mem_cgroup_thresholds thresholds;
-
- /* thresholds for mem+swap usage. RCU-protected */
- struct mem_cgroup_thresholds memsw_thresholds;
-
- /* For oom notifier event fd */
- struct list_head oom_notify;
-
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
- /*
- * set > 0 if pages under this cgroup are moving to other cgroup.
- */
- atomic_t moving_account;
- /* taken only while moving_account > 0 */
- spinlock_t move_lock;
- struct task_struct *move_lock_task;
- unsigned long move_lock_flags;
- /*
- * percpu counter.
- */
- struct mem_cgroup_stat_cpu __percpu *stat;
- spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params.memcg_caches array */
- int kmemcg_id;
- bool kmem_acct_activated;
- bool kmem_acct_active;
-#endif
-
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct list_head cgwb_list;
- struct wb_domain cgwb_domain;
-#endif
-
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
- return memcg->kmem_acct_active;
-}
-#endif
-
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
*/
static DEFINE_MUTEX(memcg_create_mutex);
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
- return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) &&
+ if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return &memcg->css;
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -631,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
return &memcg->css;
}
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ino = 0;
+
+ rcu_read_lock();
+ memcg = READ_ONCE(page->mem_cgroup);
+ while (memcg && !(memcg->css.flags & CSS_ONLINE))
+ memcg = parent_mem_cgroup(memcg);
+ if (memcg)
+ ino = cgroup_ino(memcg->css.cgroup);
+ rcu_read_unlock();
+ return ino;
+}
+
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
@@ -876,14 +714,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_zone *mz;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- return mz->lru_size[lru];
-}
-
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid,
unsigned int lru_mask)
@@ -986,6 +816,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
+EXPORT_SYMBOL(mem_cgroup_from_task);
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1031,7 +862,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1173,30 +1004,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- goto out;
-
- switch (idx) {
- case PGFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
- break;
- case PGMAJFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
- break;
- default:
- BUG();
- }
-out:
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
@@ -1295,15 +1102,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
VM_BUG_ON((long)(*lru_size) < 0);
}
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
- if (root == memcg)
- return true;
- if (!root->use_hierarchy)
- return false;
- return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
struct mem_cgroup *task_memcg;
@@ -1330,39 +1128,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
- unsigned long gb;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
-
- return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return true;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- memcg = mz->memcg;
-
- return !!(memcg->css.flags & CSS_ONLINE);
-}
-
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1394,15 +1159,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
return margin;
}
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
- /* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
- return vm_swappiness;
-
- return memcg->swappiness;
-}
-
/*
* A routine for checking "mem" is under move_account() or not.
*
@@ -1545,6 +1301,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1325,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1333,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1371,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
@@ -2062,23 +1823,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val)
-{
- VM_BUG_ON(!rcu_read_lock_held());
-
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
-}
-
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2355,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges. If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
- struct mem_cgroup *memcg;
- unsigned short id;
- swp_entry_t ent;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- memcg = page->mem_cgroup;
- if (memcg) {
- if (!css_tryget_online(&memcg->css))
- memcg = NULL;
- } else if (PageSwapCache(page)) {
- ent.val = page_private(page);
- id = lookup_swap_cgroup_id(ent);
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
- return memcg;
-}
-
static void lock_page_lru(struct page *page, int *isolated)
{
struct zone *zone = page_zone(page);
@@ -2504,16 +2214,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
css_put_many(&memcg->css, nr_pages);
}
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
- return memcg ? memcg->kmemcg_id : -1;
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -5127,10 +4827,12 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct task_struct *p = cgroup_taskset_first(tset);
- int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *from;
+ struct task_struct *p;
+ struct mm_struct *mm;
unsigned long move_flags;
+ int ret = 0;
/*
* We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4840,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* So we need to save it, and keep it going.
*/
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (move_flags) {
- struct mm_struct *mm;
- struct mem_cgroup *from = mem_cgroup_from_task(p);
+ if (!move_flags)
+ return 0;
- VM_BUG_ON(from == memcg);
+ p = cgroup_taskset_first(tset);
+ from = mem_cgroup_from_task(p);
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move a owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
-
- spin_lock(&mc.lock);
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
-
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- }
- mmput(mm);
+ VM_BUG_ON(from == memcg);
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+ /* We move charges only when we move a owner of the mm */
+ if (mm->owner == p) {
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
+
+ spin_lock(&mc.lock);
+ mc.from = from;
+ mc.to = memcg;
+ mc.flags = move_flags;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
}
+ mmput(mm);
return ret;
}
@@ -5521,19 +5224,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx,
- unsigned int nr)
-{
- this_cpu_add(memcg->stat->events[idx], nr);
-}
-
-/**
* mem_cgroup_low - check if memory consumption is below the normal range
* @root: the highest ancestor to consider
* @memcg: the memory cgroup to check
@@ -5605,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page->mem_cgroup)
goto out;
+
+ if (do_swap_account) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id = lookup_swap_cgroup_id(ent);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ }
}
if (PageTransHuge(page)) {
@@ -5614,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
- if (do_swap_account && PageSwapCache(page))
- memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5965,7 +5665,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
- /* Caller disabled preemption with mapping->tree_lock */
+ /*
+ * Interrupts should be disabled here because the caller holds the
+ * mapping->tree_lock lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for udpating the per-CPU variables.
+ */
+ VM_BUG_ON(!irqs_disabled());
mem_cgroup_charge_statistics(memcg, page, -1);
memcg_check_events(memcg, page);
}