summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c9
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/list_lru.c12
-rw-r--r--mm/memcontrol.c852
-rw-r--r--mm/memory.c3
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slab_common.c14
-rw-r--r--mm/slub.c10
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/util.c16
-rw-r--r--mm/vmscan.c28
-rw-r--r--mm/zsmalloc.c14
15 files changed, 586 insertions, 396 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b1cf73bc3b12..8ad580273521 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3357,6 +3357,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct anon_vma *anon_vma;
int count, mapcount, ret;
bool mlocked;
+ unsigned long flags;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -3396,7 +3397,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* Prevent deferred_split_scan() touching ->_count */
- spin_lock(&split_queue_lock);
+ spin_lock_irqsave(&split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && count == 1) {
@@ -3404,11 +3405,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock(&split_queue_lock);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
__split_huge_page(page, list);
ret = 0;
} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- spin_unlock(&split_queue_lock);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
@@ -3416,7 +3417,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
- spin_unlock(&split_queue_lock);
+ spin_unlock_irqrestore(&split_queue_lock, flags);
unfreeze_page(anon_vma, head);
ret = -EBUSY;
}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 64710148941e..a61460d9f5b0 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,4 +1,5 @@
KASAN_SANITIZE := n
+UBSAN_SANITIZE_kasan.o := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/list_lru.c b/mm/list_lru.c
index afc71ea9a381..1d05cb9d363d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -12,7 +12,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru)
static void list_lru_unregister(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
/*
@@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
return &nlru->lru;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
@@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l)
l->nr_items = 0;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
@@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0eda67376df4..ca052f2a4a0b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,7 +66,6 @@
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
-#include <net/tcp_memcontrol.h>
#include "slab.h"
#include <asm/uaccess.h>
@@ -83,6 +82,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
+/* Kernel memory accounting disabled? */
+static bool cgroup_memory_nokmem;
+
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
@@ -239,6 +241,7 @@ enum res_type {
_MEMSWAP,
_OOM_TYPE,
_KMEM,
+ _TCP,
};
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
@@ -247,13 +250,6 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -297,7 +293,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return mem_cgroup_from_css(css);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -349,7 +345,7 @@ void memcg_put_cache_ids(void)
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
@@ -370,13 +366,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
- *
- * XXX: The above description of behavior on the default hierarchy isn't
- * strictly true yet as replace_page_cache_page() can modify the
- * association before @page is released even on the default hierarchy;
- * however, the current and planned usages don't mix the the two functions
- * and replace_page_cache_page() will soon be updated to make the invariant
- * actually true.
*/
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
@@ -896,17 +885,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (css == &root->css)
break;
- if (css_tryget(css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- break;
-
- css_put(css);
- }
+ if (css_tryget(css))
+ break;
memcg = NULL;
}
@@ -1233,7 +1213,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
@@ -1272,9 +1252,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_limit;
+ unsigned long swap_limit;
memsw_limit = memcg->memsw.limit;
- limit = min(limit + total_swap_pages, memsw_limit);
+ swap_limit = memcg->swap.limit;
+ swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
+ limit = min(limit + swap_limit, memsw_limit);
}
return limit;
}
@@ -2203,7 +2186,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2378,16 +2361,17 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
return 0;
- if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
- return -ENOMEM;
-
ret = try_charge(memcg, gfp, nr_pages);
- if (ret) {
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (ret)
return ret;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+ cancel_charge(memcg, nr_pages);
+ return -ENOMEM;
}
page->mem_cgroup = memcg;
@@ -2416,7 +2400,9 @@ void __memcg_kmem_uncharge(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
@@ -2424,7 +2410,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
page->mem_cgroup = NULL;
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2684,14 +2670,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
bool ret;
- /*
- * The lock does not prevent addition or deletion of children, but
- * it prevents a new child from being initialized based on this
- * parent in css_online(), so it's enough to decide whether
- * hierarchically inherited attributes can still be changed or not.
- */
- lockdep_assert_held(&memcg_create_mutex);
-
rcu_read_lock();
ret = css_next_child(NULL, &memcg->css);
rcu_read_unlock();
@@ -2754,10 +2732,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
- mutex_lock(&memcg_create_mutex);
-
if (memcg->use_hierarchy == val)
- goto out;
+ return 0;
/*
* If parent's use_hierarchy is set, we can't make any modifications
@@ -2776,9 +2752,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
} else
retval = -EINVAL;
-out:
- mutex_unlock(&memcg_create_mutex);
-
return retval;
}
@@ -2794,6 +2767,18 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
+static unsigned long tree_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx)
+{
+ struct mem_cgroup *iter;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_events(iter, idx);
+
+ return val;
+}
+
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@@ -2836,6 +2821,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -2860,103 +2848,180 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+#ifndef CONFIG_SLOB
+static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- int err = 0;
int memcg_id;
BUG_ON(memcg->kmemcg_id >= 0);
- BUG_ON(memcg->kmem_acct_activated);
- BUG_ON(memcg->kmem_acct_active);
-
- /*
- * For simplicity, we won't allow this to be disabled. It also can't
- * be changed if the cgroup has children already, or if tasks had
- * already joined.
- *
- * If tasks join before we set the limit, a person looking at
- * kmem.usage_in_bytes will have no way to determine when it took
- * place, which makes the value quite meaningless.
- *
- * After it first became limited, changes in the value of the limit are
- * of course permitted.
- */
- mutex_lock(&memcg_create_mutex);
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- err = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
- if (err)
- goto out;
+ BUG_ON(memcg->kmem_state);
memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0) {
- err = memcg_id;
- goto out;
- }
-
- /*
- * We couldn't have accounted to this cgroup, because it hasn't got
- * activated yet, so this should succeed.
- */
- err = page_counter_limit(&memcg->kmem, nr_pages);
- VM_BUG_ON(err);
+ if (memcg_id < 0)
+ return memcg_id;
static_branch_inc(&memcg_kmem_enabled_key);
/*
- * A memory cgroup is considered kmem-active as soon as it gets
+ * A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
memcg->kmemcg_id = memcg_id;
- memcg->kmem_acct_activated = true;
- memcg->kmem_acct_active = true;
-out:
- return err;
+ memcg->kmem_state = KMEM_ONLINE;
+
+ return 0;
}
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_propagate_kmem(struct mem_cgroup *parent,
+ struct mem_cgroup *memcg)
{
- int ret;
+ int ret = 0;
mutex_lock(&memcg_limit_mutex);
- if (!memcg_kmem_is_active(memcg))
- ret = memcg_activate_kmem(memcg, limit);
- else
- ret = page_counter_limit(&memcg->kmem, limit);
+ /*
+ * If the parent cgroup is not kmem-online now, it cannot be
+ * onlined after this point, because it has at least one child
+ * already.
+ */
+ if (memcg_kmem_online(parent) ||
+ (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
+ ret = memcg_online_kmem(memcg);
mutex_unlock(&memcg_limit_mutex);
return ret;
}
-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- int ret = 0;
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (memcg->kmem_state != KMEM_ONLINE)
+ return;
+ /*
+ * Clear the online state before clearing memcg_caches array
+ * entries. The slab_mutex in memcg_deactivate_kmem_caches()
+ * guarantees that no cache will be created for this cgroup
+ * after we are done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_state = KMEM_ALLOCATED;
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
if (!parent)
- return 0;
+ parent = root_mem_cgroup;
- mutex_lock(&memcg_limit_mutex);
/*
- * If the parent cgroup is not kmem-active now, it cannot be activated
- * after this point, because it has at least one child already.
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
*/
- if (memcg_kmem_is_active(parent))
- ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+ /* css_alloc() failed, offlining didn't happen */
+ if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+ memcg_offline_kmem(memcg);
+
+ if (memcg->kmem_state == KMEM_ALLOCATED) {
+ memcg_destroy_kmem_caches(memcg);
+ static_branch_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
}
#else
+static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static int memcg_online_kmem(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
+{
+}
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+}
+#endif /* !CONFIG_SLOB */
+
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- return -EINVAL;
+ int ret = 0;
+
+ mutex_lock(&memcg_limit_mutex);
+ /* Top-level cgroup doesn't propagate from root */
+ if (!memcg_kmem_online(memcg)) {
+ if (cgroup_is_populated(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ ret = -EBUSY;
+ if (ret)
+ goto out;
+ ret = memcg_online_kmem(memcg);
+ if (ret)
+ goto out;
+ }
+ ret = page_counter_limit(&memcg->kmem, limit);
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
+}
+
+static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+{
+ int ret;
+
+ mutex_lock(&memcg_limit_mutex);
+
+ ret = page_counter_limit(&memcg->tcpmem, limit);
+ if (ret)
+ goto out;
+
+ if (!memcg->tcpmem_active) {
+ /*
+ * The active flag needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ */
+ static_branch_inc(&memcg_sockets_enabled_key);
+ memcg->tcpmem_active = true;
+ }
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
}
-#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
@@ -2990,6 +3055,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
break;
+ case _TCP:
+ ret = memcg_update_tcp_limit(memcg, nr_pages);
+ break;
}
break;
case RES_SOFT_LIMIT:
@@ -3016,6 +3084,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -3582,88 +3653,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- int ret;
-
- ret = memcg_propagate_kmem(memcg);
- if (ret)
- return ret;
-
- return tcp_init_cgroup(memcg, ss);
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *parent, *child;
- int kmemcg_id;
-
- if (!memcg->kmem_acct_active)
- return;
-
- /*
- * Clear the 'active' flag before clearing memcg_caches arrays entries.
- * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
- * guarantees no cache will be created for this cgroup after we are
- * done (see memcg_create_kmem_cache()).
- */
- memcg->kmem_acct_active = false;
-
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- /*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent. After we have finished, all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty. The
- * ordering is imposed by list_lru_node->lock taken by
- * memcg_drain_all_list_lrus().
- */
- css_for_each_descendant_pre(css, &memcg->css) {
- child = mem_cgroup_from_css(css);
- BUG_ON(child->kmemcg_id != kmemcg_id);
- child->kmemcg_id = parent->kmemcg_id;
- if (!memcg->use_hierarchy)
- break;
- }
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
-
- memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
- if (memcg->kmem_acct_activated) {
- memcg_destroy_kmem_caches(memcg);
- static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
- }
- tcp_destroy_cgroup(memcg);
-}
-#else
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- return 0;
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
-}
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
@@ -4051,7 +4040,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
@@ -4084,7 +4072,29 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_slab_show,
},
#endif
-#endif
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
{ }, /* terminate */
};
@@ -4123,147 +4133,92 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static struct mem_cgroup *mem_cgroup_alloc(void)
-{
- struct mem_cgroup *memcg;
- size_t size;
-
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
- if (!memcg)
- return NULL;
-
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
- goto out_free;
-
- if (memcg_wb_domain_init(memcg, GFP_KERNEL))
- goto out_free_stat;
-
- return memcg;
-
-out_free_stat:
- free_percpu(memcg->stat);
-out_free:
- kfree(memcg);
- return NULL;
-}
-
-/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
- */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- cancel_work_sync(&memcg->high_work);
-
- mem_cgroup_remove_from_trees(memcg);
-
+ memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
-
free_percpu(memcg->stat);
- memcg_wb_domain_exit(memcg);
kfree(memcg);
}
-static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- long error = -ENOMEM;
+ size_t size;
int node;
- memcg = mem_cgroup_alloc();
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return NULL;
+
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
+ goto fail;
for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node))
- goto free_out;
+ goto fail;
- /* root ? */
- if (parent_css == NULL) {
- root_mem_cgroup = memcg;
- page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
- page_counter_init(&memcg->memsw, NULL);
- page_counter_init(&memcg->kmem, NULL);
- }
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
- memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
-#ifdef CONFIG_MEMCG_KMEM
+ memcg->socket_pressure = jiffies;
+#ifndef CONFIG_SLOB
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
-#ifdef CONFIG_INET
- memcg->socket_pressure = jiffies;
-#endif
- return &memcg->css;
-
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
+ return memcg;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
}
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
- int ret;
-
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
- if (!parent)
- return 0;
-
- mutex_lock(&memcg_create_mutex);
+ struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+ struct mem_cgroup *memcg;
+ long error = -ENOMEM;
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
- memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
+ return ERR_PTR(error);
- if (parent->use_hierarchy) {
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ if (parent) {
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ }
+ if (parent && parent->use_hierarchy) {
+ memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
-
- /*
- * No need to take a reference to the parent because cgroup
- * core guarantees its existence.
- */
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4272,23 +4227,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
memory_cgrp_subsys.broken_hierarchy = true;
}
- mutex_unlock(&memcg_create_mutex);
- ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
- if (ret)
- return ret;
+ /* The following stuff does not apply to the root */
+ if (!parent) {
+ root_mem_cgroup = memcg;
+ return &memcg->css;
+ }
+
+ error = memcg_propagate_kmem(parent, memcg);
+ if (error)
+ goto fail;
-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
-#endif
- /*
- * Make sure the memcg is initialized: mem_cgroup_iter()
- * orders reading memcg->initialized against its callers
- * reading the memcg members.
- */
- smp_store_release(&memcg->initialized, 1);
+ return &memcg->css;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ if (css->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
return 0;
}
@@ -4310,10 +4273,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- vmpressure_cleanup(&memcg->vmpressure);
-
- memcg_deactivate_kmem(memcg);
-
+ memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
}
@@ -4328,12 +4288,17 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- memcg_destroy_kmem(memcg);
-#ifdef CONFIG_INET
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
-#endif
- __mem_cgroup_free(memcg);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ vmpressure_cleanup(&memcg->vmpressure);
+ cancel_work_sync(&memcg->high_work);
+ mem_cgroup_remove_from_trees(memcg);
+ memcg_free_kmem(memcg);
+ mem_cgroup_free(memcg);
}
/**
@@ -5143,6 +5108,59 @@ static int memory_events_show(struct seq_file *m, void *v)
return 0;
}
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ int i;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_printf(m, "anon %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ seq_printf(m, "file %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ seq_printf(m, "sock %llu\n",
+ (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+
+ seq_printf(m, "file_mapped %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_printf(m, "file_dirty %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
+ PAGE_SIZE);
+ seq_printf(m, "file_writeback %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ struct mem_cgroup *mi;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i));
+ seq_printf(m, "%s %llu\n",
+ mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
+ }
+
+ /* Accumulated memory events */
+
+ seq_printf(m, "pgfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ seq_printf(m, "pgmajfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+
+ return 0;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5173,6 +5191,11 @@ static struct cftype memory_files[] = {
.file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
+ {
+ .name = "stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_stat_show,
+ },
{ } /* terminate */
};
@@ -5269,7 +5292,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (page->mem_cgroup)
goto out;
- if (do_memsw_account()) {
+ if (do_swap_account) {
swp_entry_t ent = { .val = page_private(page), };
unsigned short id = lookup_swap_cgroup_id(ent);
@@ -5504,7 +5527,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
- int isolated;
+ unsigned int nr_pages;
+ bool compound;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
@@ -5524,14 +5548,22 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
if (!memcg)
return;
- lock_page_lru(oldpage, &isolated);
- oldpage->mem_cgroup = NULL;
- unlock_page_lru(oldpage, isolated);
+ /* Force-charge the new page. The old one will be freed soon */
+ compound = PageTransHuge(newpage);
+ nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
commit_charge(newpage, memcg, true);
-}
-#ifdef CONFIG_INET
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ memcg_check_events(memcg, newpage);
+ local_irq_enable();
+}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
@@ -5558,10 +5590,8 @@ void sock_update_memcg(struct sock *sk)
memcg = mem_cgroup_from_task(current);
if (memcg == root_mem_cgroup)
goto out;
-#ifdef CONFIG_MEMCG_KMEM
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
-#endif
if (css_tryget_online(&memcg->css))
sk->sk_memcg = memcg;
out:
@@ -5587,24 +5617,24 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
gfp_t gfp_mask = GFP_KERNEL;
-#ifdef CONFIG_MEMCG_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- struct page_counter *counter;
+ struct page_counter *fail;
- if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
- nr_pages, &counter)) {
- memcg->tcp_mem.memory_pressure = 0;
+ if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ memcg->tcpmem_pressure = 0;
return true;
}
- page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
- memcg->tcp_mem.memory_pressure = 1;
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ memcg->tcpmem_pressure = 1;
return false;
}
-#endif
+
/* Don't block in the packet receive path */
if (in_softirq())
gfp_mask = GFP_NOWAIT;
+ this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
if (try_charge(memcg, gfp_mask, nr_pages) == 0)
return true;
@@ -5619,19 +5649,17 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
{
-#ifdef CONFIG_MEMCG_KMEM
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
- nr_pages);
+ page_counter_uncharge(&memcg->tcpmem, nr_pages);
return;
}
-#endif
+
+ this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_INET */
-
static int __init cgroup_memory(char *s)
{
char *token;
@@ -5641,6 +5669,8 @@ static int __init cgroup_memory(char *s)
continue;
if (!strcmp(token, "nosocket"))
cgroup_memory_nosocket = true;
+ if (!strcmp(token, "nokmem"))
+ cgroup_memory_nokmem = true;
}
return 0;
}
@@ -5730,32 +5760,107 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page);
}
+/*
+ * mem_cgroup_try_charge_swap - try charging a swap entry
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @entry to the memcg that @page belongs to.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ struct page_counter *counter;
+ unsigned short oldid;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ return 0;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return 0;
+
+ if (!mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, 1, &counter))
+ return -ENOMEM;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ css_get(&memcg->css);
+ return 0;
+}
+
/**
* mem_cgroup_uncharge_swap - uncharge a swap entry
* @entry: swap entry to uncharge
*
- * Drop the memsw charge associated with @entry.
+ * Drop the swap charge associated with @entry.
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_memsw_account())
+ if (!do_swap_account)
return;
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->swap, 1);
+ else
+ page_counter_uncharge(&memcg->memsw, 1);
+ }
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
rcu_read_unlock();
}
+long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ long nr_swap_pages = get_nr_swap_pages();
+
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return nr_swap_pages;
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ nr_swap_pages = min_t(long, nr_swap_pages,
+ READ_ONCE(memcg->swap.limit) -
+ page_counter_read(&memcg->swap));
+ return nr_swap_pages;
+}
+
+bool mem_cgroup_swap_full(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (vm_swap_full())
+ return true;
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ memcg = page->mem_cgroup;
+ if (!memcg)
+ return false;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ return true;
+
+ return false;
+}
+
/* for remember boot option*/
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
@@ -5773,6 +5878,63 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);
+static u64 swap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+static int swap_max_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = READ_ONCE(memcg->swap.limit);
+
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ mutex_lock(&memcg_limit_mutex);
+ err = page_counter_limit(&memcg->swap, max);
+ mutex_unlock(&memcg_limit_mutex);
+ if (err)
+ return err;
+
+ return nbytes;
+}
+
+static struct cftype swap_files[] = {
+ {
+ .name = "swap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_max_show,
+ .write = swap_max_write,
+ },
+ { } /* terminate */
+};
+
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
@@ -5804,6 +5966,8 @@ static int __init mem_cgroup_swap_init(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1;
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
+ swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
memsw_cgroup_files));
}
diff --git a/mm/memory.c b/mm/memory.c
index ff17850a52d9..30991f83d0bf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2582,7 +2582,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if (mem_cgroup_swap_full(page) ||
+ (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index e88d071648c2..5d453e58ddbf 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
goto free_proc_pages;
}
- mm = mm_access(task, PTRACE_MODE_ATTACH);
+ mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index b98e1011858c..fa2ceb2d2655 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;
+ if (mem_cgroup_try_charge_swap(page, swap))
+ goto free_swap;
+
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
+free_swap:
swapcache_free(swap);
redirty:
set_page_dirty(page);
diff --git a/mm/slab.h b/mm/slab.h
index c63b8699cfa3..834ad240c0bb 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -173,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
* slab_mutex.
@@ -251,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
extern void slab_init_memcg_params(struct kmem_cache *);
-#else /* !CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG && !CONFIG_SLOB */
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
@@ -292,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e016178063e1..b50aef01ccf7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -128,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
void slab_init_memcg_params(struct kmem_cache *s)
{
s->memcg_params.is_root_cache = true;
@@ -221,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s,
static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
* Find a mergeable slab cache
@@ -477,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier)
}
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
@@ -503,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&slab_mutex);
/*
- * The memory cgroup could have been deactivated while the cache
+ * The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -689,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
void slab_kmem_cache_release(struct kmem_cache *s)
{
@@ -1123,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
int memcg_slab_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
diff --git a/mm/slub.c b/mm/slub.c
index b21fd24b08b1..2e1355ac056b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5207,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
struct kmem_cache *c;
@@ -5242,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
static void memcg_propagate_slab_attrs(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
int i;
char *buffer = NULL;
struct kmem_cache *root_cache;
@@ -5328,7 +5328,7 @@ static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (!is_root_cache(s))
return s->memcg_params.root_cache->memcg_kset;
#endif
@@ -5405,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (is_root_cache(s)) {
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
if (!s->memcg_kset) {
@@ -5438,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 676ff2991380..69cb2464e7dc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
if (!entry.val)
return 0;
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ swapcache_free(entry);
+ return 0;
+ }
+
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2bb30aa3a412..c43f654a7b64 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
count--;
}
- if (!count)
- mem_cgroup_uncharge_swap(entry);
-
usage = count | has_cache;
p->swap_map[offset] = usage;
/* free if no reference */
if (!usage) {
+ mem_cgroup_uncharge_swap(entry);
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
@@ -1008,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || vm_swap_full())) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page))) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
diff --git a/mm/util.c b/mm/util.c
index 6d1f9200f74e..c108a6542d05 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -476,17 +476,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
+ unsigned long arg_start, arg_end, env_start, env_end;
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
- len = mm->arg_end - mm->arg_start;
+ down_read(&mm->mmap_sem);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
+ len = arg_end - arg_start;
if (len > buflen)
len = buflen;
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
+ res = access_process_vm(task, arg_start, buffer, len, 0);
/*
* If the nul at the end of args has been overwritten, then
@@ -497,10 +505,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
if (len < res) {
res = len;
} else {
- len = mm->env_end - mm->env_start;
+ len = env_end - env_start;
if (len > buflen - res)
len = buflen - res;
- res += access_process_vm(task, mm->env_start,
+ res += access_process_vm(task, env_start,
buffer+res, len, 0);
res = strnlen(buffer, res);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ac86956ff9d..bd620b65db52 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -411,7 +411,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_is_active(memcg))
+ if (memcg && !memcg_kmem_online(memcg))
return 0;
if (nr_scanned == 0)
@@ -1214,7 +1214,7 @@ cull_mlocked:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && vm_swap_full())
+ if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
@@ -1966,10 +1966,11 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, int swappiness,
+static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *nr,
unsigned long *lru_pages)
{
+ int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
@@ -1996,14 +1997,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
if (current_is_kswapd()) {
if (!zone_reclaimable(zone))
force_scan = true;
- if (!mem_cgroup_lruvec_online(lruvec))
+ if (!mem_cgroup_online(memcg))
force_scan = true;
}
if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2193,9 +2194,10 @@ static inline void init_tlb_ubc(void)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc, unsigned long *lru_pages)
+static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long *lru_pages)
{
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2205,7 +2207,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2409,8 +2411,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
- struct lruvec *lruvec;
- int swappiness;
if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
@@ -2418,12 +2418,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- swappiness = mem_cgroup_swappiness(memcg);
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, sc, &lru_pages);
zone_lru_pages += lru_pages;
if (memcg && is_classzone)
@@ -2893,8 +2891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
};
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- int swappiness = mem_cgroup_swappiness(memcg);
unsigned long lru_pages;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2911,7 +2907,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e7414cec220b..2d7c4c11fc63 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -309,7 +309,12 @@ static void free_handle(struct zs_pool *pool, unsigned long handle)
static void record_obj(unsigned long handle, unsigned long obj)
{
- *(unsigned long *)handle = obj;
+ /*
+ * lsb of @obj represents handle lock while other bits
+ * represent object value the handle is pointing so
+ * updating shouldn't do store tearing.
+ */
+ WRITE_ONCE(*(unsigned long *)handle, obj);
}
/* zpool driver */
@@ -1635,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj = obj_malloc(d_page, class, handle);
zs_object_copy(free_obj, used_obj, class);
index++;
+ /*
+ * record_obj updates handle's value to free_obj and it will
+ * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
+ * breaks synchronization using pin_tag(e,g, zs_free) so
+ * let's keep the lock bit.
+ */
+ free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);