Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   1095
1 file changed, 635 insertions, 460 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3503d12aaf..902da8a9c643 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
@@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+static struct kmem_cache *memcg_cachep;
+static struct kmem_cache *memcg_pn_cachep;
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -129,8 +133,7 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages);
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -163,8 +166,16 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
- if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ memcg1_account_kmem(memcg, -nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
+ }
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -315,6 +326,7 @@ static const unsigned int memcg_node_stat_items[] = {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+ PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
@@ -431,9 +443,11 @@ static const unsigned int memcg_vm_event_stat[] = {
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
+ PGSCAN_PROACTIVE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSTEAL_PROACTIVE,
PGFAULT,
PGMAJFAULT,
PGREFILL,
@@ -460,6 +474,8 @@ static const unsigned int memcg_vm_event_stat[] = {
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
};
@@ -489,8 +505,8 @@ struct memcg_vmstats_percpu {
unsigned int stats_updates;
/* Cached pointers for fast iteration in memcg_rstat_updated() */
- struct memcg_vmstats_percpu *parent;
- struct memcg_vmstats *vmstats;
+ struct memcg_vmstats_percpu __percpu *parent_pcpu;
+ struct memcg_vmstats *vmstats;
/* The above should fit a single cacheline for memcg_rstat_updated() */
@@ -517,7 +533,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
- atomic64_t stats_updates;
+ atomic_t stats_updates;
};
/*
@@ -541,60 +557,43 @@ static u64 flush_last_time;
#define FLUSH_TIME (2UL*HZ)
-/*
- * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
- * not rely on this as part of an acquired spinlock_t lock. These functions are
- * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
- * is sufficient.
- */
-static void memcg_stats_lock(void)
-{
- preempt_disable_nested();
- VM_WARN_ON_IRQS_ENABLED();
-}
-
-static void __memcg_stats_lock(void)
-{
- preempt_disable_nested();
-}
-
-static void memcg_stats_unlock(void)
-{
- preempt_enable_nested();
-}
-
-
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&vmstats->stats_updates) >
+ return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
+ int cpu)
{
+ struct memcg_vmstats_percpu __percpu *statc_pcpu;
struct memcg_vmstats_percpu *statc;
- int cpu = smp_processor_id();
unsigned int stats_updates;
if (!val)
return;
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
- statc = this_cpu_ptr(memcg->vmstats_percpu);
- for (; statc; statc = statc->parent) {
- stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
- WRITE_ONCE(statc->stats_updates, stats_updates);
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (!in_nmi())
+ css_rstat_updated(&memcg->css, cpu);
+ statc_pcpu = memcg->vmstats_percpu;
+ for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
+ statc = this_cpu_ptr(statc_pcpu);
+ /*
+ * If @memcg is already flushable then all its ancestors are
+ * flushable as well and also there is no need to increase
+ * stats_updates.
+ */
+ if (memcg_vmstats_needs_flush(statc->vmstats))
+ break;
+
+ stats_updates = this_cpu_add_return(statc_pcpu->stats_updates,
+ abs(val));
if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
- /*
- * If @memcg is already flush-able, increasing stats_updates is
- * redundant. Avoid the overhead of the atomic update.
- */
- if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(stats_updates,
- &statc->vmstats->stats_updates);
- WRITE_ONCE(statc->stats_updates, 0);
+ stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
+ atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
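
The rework above batches stat updates per cpu and only folds them into the
shared atomic once a full MEMCG_CHARGE_BATCH has accumulated, skipping
ancestors that are already due for a flush. A minimal single-file sketch of
that batching scheme, with plain arrays standing in for the per-cpu data and
an illustrative batch size:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4
#define BATCH 64                        /* stand-in for MEMCG_CHARGE_BATCH */

static int pcpu_pending[NCPUS];         /* models per-cpu stats_updates */
static atomic_int total_pending;        /* models vmstats->stats_updates */

static int needs_flush(void)
{
    /* flush once the aggregate exceeds one batch per online cpu */
    return atomic_load(&total_pending) > BATCH * NCPUS;
}

static void stat_updated(int cpu, int val)
{
    if (!val || needs_flush())
        return;                         /* already flushable, stop counting */

    pcpu_pending[cpu] += abs(val);
    if (pcpu_pending[cpu] < BATCH)
        return;                         /* keep batching locally */

    /* fold the local batch into the shared counter, as this_cpu_xchg() does */
    atomic_fetch_add(&total_pending, pcpu_pending[cpu]);
    pcpu_pending[cpu] = 0;
}

int main(void)
{
    for (int i = 0; i < 1000; i++)
        stat_updated(i % NCPUS, 1);
    printf("pending=%d flush=%d\n", atomic_load(&total_pending), needs_flush());
    return 0;
}
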
@@ -602,7 +601,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
- trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
+ trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@@ -611,7 +610,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -684,15 +683,16 @@ static int memcg_state_val_in_pages(int idx, int val)
}
/**
- * __mod_memcg_state - update cgroup memory statistics
+ * mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
int val)
{
int i = memcg_stats_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -700,12 +700,17 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_state(memcg, idx, val);
+
+ put_cpu();
}
+#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
@@ -722,14 +727,16 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+#endif
-static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+static void mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
+ int cpu;
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
@@ -737,35 +744,19 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- /*
- * The caller from rmap relies on disabled preemption because they never
- * update their counter from in-interrupt context. For these two
- * counters we check that the update is never performed from an
- * interrupt context while other caller need to have disabled interrupt.
- */
- __memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- switch (idx) {
- case NR_ANON_MAPPED:
- case NR_FILE_MAPPED:
- case NR_ANON_THPS:
- WARN_ON_ONCE(!in_task());
- break;
- default:
- VM_WARN_ON_IRQS_ENABLED();
- }
- }
+ cpu = get_cpu();
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[i], val);
+ this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
+ this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
val = memcg_state_val_in_pages(idx, val);
- memcg_rstat_updated(memcg, val);
+ memcg_rstat_updated(memcg, val, cpu);
trace_mod_memcg_lruvec_state(memcg, idx, val);
- memcg_stats_unlock();
+
+ put_cpu();
}
/**
@@ -786,7 +777,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
- __mod_memcg_lruvec_state(lruvec, idx, val);
+ mod_memcg_lruvec_state(lruvec, idx, val);
}
void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
@@ -836,15 +827,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
}
/**
- * __count_memcg_events - account VM events in a cgroup
+ * count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occurred
*/
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
int i = memcg_events_index(idx);
+ int cpu;
if (mem_cgroup_disabled())
return;
@@ -852,11 +844,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
- memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[i], count);
- memcg_rstat_updated(memcg, count);
+ cpu = get_cpu();
+
+ this_cpu_add(memcg->vmstats_percpu->events[i], count);
+ memcg_rstat_updated(memcg, count, cpu);
trace_count_memcg_events(memcg, idx, count);
- memcg_stats_unlock();
+
+ put_cpu();
}
unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -869,6 +863,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events[i]);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
@@ -878,6 +873,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events_local[i]);
}
+#endif
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
@@ -1169,8 +1165,11 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct task_struct *task;
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
- while (!ret && (task = css_task_iter_next(&it)))
+ while (!ret && (task = css_task_iter_next(&it))) {
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
+ }
css_task_iter_end(&it);
if (ret) {
mem_cgroup_iter_break(memcg, iter);
@@ -1385,6 +1384,7 @@ static const struct memory_stat memory_stats[] = {
{ "pgdemote_kswapd", PGDEMOTE_KSWAPD },
{ "pgdemote_direct", PGDEMOTE_DIRECT },
{ "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
+ { "pgdemote_proactive", PGDEMOTE_PROACTIVE },
#ifdef CONFIG_NUMA_BALANCING
{ "pgpromote_success", PGPROMOTE_SUCCESS },
#endif
@@ -1427,6 +1427,7 @@ static int memcg_page_state_output_unit(int item)
case PGDEMOTE_KSWAPD:
case PGDEMOTE_DIRECT:
case PGDEMOTE_KHUGEPAGED:
+ case PGDEMOTE_PROACTIVE:
#ifdef CONFIG_NUMA_BALANCING
case PGPROMOTE_SUCCESS:
#endif
@@ -1442,11 +1443,25 @@ unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
memcg_page_state_output_unit(item);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
return memcg_page_state_local(memcg, item) *
memcg_page_state_output_unit(item);
}
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static bool memcg_accounts_hugetlb(void)
+{
+ return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+}
+#else /* CONFIG_HUGETLB_PAGE */
+static bool memcg_accounts_hugetlb(void)
+{
+ return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
@@ -1469,7 +1484,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
#ifdef CONFIG_HUGETLB_PAGE
if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ !memcg_accounts_hugetlb())
continue;
#endif
size = memcg_page_state_output(memcg, memory_stats[i].idx);
@@ -1486,10 +1501,12 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_PROACTIVE) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_PROACTIVE) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
@@ -1549,16 +1566,23 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
+ unsigned long memory_failcnt;
lockdep_assert_held(&oom_lock);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]);
+ else
+ memory_failcnt = memcg->memory.failcnt;
+
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
- K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ K((u64)READ_ONCE(memcg->swap.max)),
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
#ifdef CONFIG_MEMCG_V1
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
@@ -1627,7 +1651,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = task_is_dying() || out_of_memory(&oc);
+ ret = out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1721,28 +1745,45 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in future.
+ */
+#define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE 0
struct memcg_stock_pcp {
- local_lock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ local_trylock_t lock;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
+ struct work_struct work;
+ unsigned long flags;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
+};
+
+struct obj_stock_pcp {
+ local_trylock_t lock;
+ unsigned int nr_bytes;
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
- unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
struct work_struct work;
unsigned long flags;
-#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+ .lock = INIT_LOCAL_TRYLOCK(lock),
};
+
static DEFINE_MUTEX(percpu_charge_mutex);
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
/**
@@ -1750,110 +1791,188 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
*
- * The charges will only happen if @memcg matches the current cpu's memcg
- * stock, and at least @nr_pages are available in that stock. Failure to
- * service an allocation will refill the stock.
+ * Consume the cached charge if enough nr_pages are present otherwise return
+ * failure. Also return failure for charge request larger than
+ * MEMCG_CHARGE_BATCH or if the local lock is already taken.
*
* returns true if successful, false otherwise.
*/
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
- unsigned long flags;
+ uint8_t stock_pages;
bool ret = false;
+ int i;
- if (nr_pages > MEMCG_CHARGE_BATCH)
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock))
return ret;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
-
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&memcg_stock.lock);
return ret;
}
+static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
- page_counter_uncharge(&old->memory, stock_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, stock_pages);
-
- WRITE_ONCE(stock->nr_pages, 0);
+ memcg_uncharge(old, stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
}
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_local_memcg_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
- /*
- * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
- * drain_stock races is that we always operate on local CPU stock
- * here with IRQ disabled
- */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&memcg_stock.lock);
stock = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ local_unlock(&memcg_stock.lock);
}
-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_local_obj_stock(struct work_struct *dummy)
+{
+ struct obj_stock_pcp *stock;
+
+ if (WARN_ONCE(!in_task(), "drain in non-task context"))
+ return;
+
+ local_lock(&obj_stock.lock);
+
+ stock = this_cpu_ptr(&obj_stock);
+ drain_obj_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+ local_unlock(&obj_stock.lock);
+}
+
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
+ bool success = false;
+ int empty_slot = -1;
+ int i;
+
+ /*
+ * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we
+ * decide to increase it more than 127 then we will need more careful
+ * handling of nr_pages[] in struct memcg_stock_pcp.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
+ VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
+
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock(&memcg_stock.lock)) {
+ /*
+ * In case of larger than batch refill or unlikely failure to
+ * lock the percpu memcg_stock.lock, uncharge memcg directly.
+ */
+ memcg_uncharge(memcg, nr_pages);
+ return;
+ }
stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
}
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ local_unlock(&memcg_stock.lock);
}
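
consume_stock()/refill_stock() now manage up to NR_MEMCG_STOCK cached memcgs
per cpu instead of one, and evict a random slot when none is free. A compact
userspace sketch of that slot handling, using integer ids in place of memcg
pointers (the slot count and the rand() eviction are only illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_STOCK 7

static int     cached[NR_STOCK];        /* 0 means empty, otherwise a memcg id */
static uint8_t nr_pages[NR_STOCK];

static bool consume(int id, unsigned int pages)
{
    for (int i = 0; i < NR_STOCK; i++) {
        if (cached[i] != id)
            continue;
        if (nr_pages[i] >= pages) {
            nr_pages[i] -= pages;       /* charge satisfied from the cache */
            return true;
        }
        break;                          /* an id occupies at most one slot */
    }
    return false;
}

static void refill(int id, unsigned int pages)
{
    int empty = -1;

    for (int i = 0; i < NR_STOCK; i++) {
        if (!cached[i] && empty == -1)
            empty = i;
        if (cached[i] == id) {
            nr_pages[i] += pages;       /* the real code drains if this overflows */
            return;
        }
    }
    /* no slot for this id: take an empty one, or evict a random victim */
    if (empty == -1)
        empty = rand() % NR_STOCK;
    cached[empty] = id;
    nr_pages[empty] = pages;
}

int main(void)
{
    bool a, b;

    refill(42, 32);
    a = consume(42, 16);                /* true: 32 cached pages cover 16 */
    b = consume(42, 32);                /* false: only 16 pages remain */
    printf("%d %d\n", a, b);
    return 0;
}
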
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
{
- unsigned long flags;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- __refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ rcu_read_lock();
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return flush;
}
/*
@@ -1876,25 +1995,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
+ struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+ struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+ is_memcg_drain_needed(memcg_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &memcg_st->flags)) {
+ if (cpu == curcpu)
+ drain_local_memcg_stock(&memcg_st->work);
+ else if (!cpu_is_isolated(cpu))
+ schedule_work_on(cpu, &memcg_st->work);
+ }
- if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+ obj_stock_flush_required(obj_st, root_memcg) &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+ &obj_st->flags)) {
if (cpu == curcpu)
- drain_local_stock(&stock->work);
+ drain_local_obj_stock(&obj_st->work);
else if (!cpu_is_isolated(cpu))
- schedule_work_on(cpu, &stock->work);
+ schedule_work_on(cpu, &obj_st->work);
}
}
migrate_enable();
@@ -1903,10 +2024,9 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
- struct memcg_stock_pcp *stock;
-
- stock = &per_cpu(memcg_stock, cpu);
- drain_stock(stock);
+ /* no need for the local lock */
+ drain_obj_stock(&per_cpu(obj_stock, cpu));
+ drain_stock_fully(&per_cpu(memcg_stock, cpu));
return 0;
}
@@ -2181,8 +2301,8 @@ out:
css_put(&memcg->css);
}
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2199,6 +2319,10 @@ retry:
if (consume_stock(memcg, nr_pages))
return 0;
+ if (!gfpflags_allow_spinning(gfp_mask))
+ /* Avoid the refill and flush of the older stock */
+ batch = nr_pages;
+
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2371,19 +2495,13 @@ done_restock:
return 0;
}
-/**
- * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
- * @memcg: memcg previously charged.
- * @nr_pages: number of pages previously charged.
- */
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
- return;
+ return 0;
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
+ return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
@@ -2399,29 +2517,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-/**
- * mem_cgroup_commit_charge - commit a previously successful try_charge().
- * @folio: folio to commit the charge to.
- * @memcg: memcg previously charged.
- */
-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
{
- css_get(&memcg->css);
- commit_charge(folio, memcg);
- memcg1_commit_charge(folio, memcg);
+ struct lruvec *lruvec;
+
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic_add(nr, &pn->slab_reclaimable);
+ else
+ atomic_add(nr, &pn->slab_unreclaimable);
+ }
}
+#else
+static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
+}
+#endif
-static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
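
account_slab_nmi_safe() keeps the regular lruvec update on the common path
and, from NMI context, parks the delta in a plain atomic that a later flush
folds back in (see flush_nmi_stats() further down). A small sketch of that
deferred-side-counter pattern, with a bool standing in for in_nmi():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static long counter;                    /* the normally-protected statistic */
static atomic_long deferred;            /* updates from the restricted context */

static void account(long delta, bool restricted_ctx)
{
    if (!restricted_ctx)
        counter += delta;                       /* normal path */
    else
        atomic_fetch_add(&deferred, delta);     /* NMI-style path: defer */
}

static void flush(void)
{
    /* fold deferred updates into the real counter, as flush_nmi_stats() does */
    counter += atomic_exchange(&deferred, 0);
}

int main(void)
{
    account(3, false);
    account(5, true);
    flush();
    printf("counter=%ld\n", counter);   /* 8 */
    return 0;
}
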
@@ -2546,6 +2682,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
+ return NULL;
+
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@@ -2608,6 +2747,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ atomic_add(val, &memcg->kmem_stat);
+ }
+}
+#else
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+}
+#endif
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2620,9 +2776,10 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ refill_stock(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2647,7 +2804,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -2655,6 +2812,23 @@ out:
return ret;
}
+static struct obj_cgroup *page_objcg(const struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ if (mem_cgroup_disabled() || !memcg_data)
+ return NULL;
+
+ VM_BUG_ON_PAGE((memcg_data & OBJEXTS_FLAGS_MASK) != MEMCG_DATA_KMEM,
+ page);
+ return (struct obj_cgroup *)(memcg_data - MEMCG_DATA_KMEM);
+}
+
+static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
+{
+ page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
+}
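
page_objcg()/page_set_objcg() rely on the usual tagged-pointer trick: the
objcg pointer is aligned, so a low bit of page->memcg_data is free to carry
the kmem marker, and the pointer is recovered by stripping it again. A tiny
sketch of the encode/decode, with a made-up tag value standing in for the
real MEMCG_DATA_KMEM:

#include <stdint.h>
#include <stdio.h>

#define KMEM_TAG 0x1UL                  /* illustrative; not the kernel's value */

struct objcg { int id; };

static uintptr_t encode(struct objcg *objcg)
{
    return (uintptr_t)objcg | KMEM_TAG; /* low bits are zero: objcg is aligned */
}

static struct objcg *decode(uintptr_t data)
{
    if (!(data & KMEM_TAG))
        return NULL;                    /* not a kmem-charged page */
    return (struct objcg *)(data - KMEM_TAG);
}

int main(void)
{
    struct objcg cg = { .id = 7 };
    uintptr_t data = encode(&cg);

    printf("id=%d\n", decode(data)->id);
    return 0;
}
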
+
/**
* __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
@@ -2673,8 +2847,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
obj_cgroup_get(objcg);
- page->memcg_data = (unsigned long)objcg |
- MEMCG_DATA_KMEM;
+ page_set_objcg(page, objcg);
return 0;
}
}
@@ -2688,53 +2861,38 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct folio *folio = page_folio(page);
- struct obj_cgroup *objcg;
+ struct obj_cgroup *objcg = page_objcg(page);
unsigned int nr_pages = 1 << order;
- if (!folio_memcg_kmem(folio))
+ if (!objcg)
return;
- objcg = __folio_objcg(folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
- folio->memcg_data = 0;
+ page->memcg_data = 0;
obj_cgroup_put(objcg);
}
-static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static void __account_obj_stock(struct obj_cgroup *objcg,
+ struct obj_stock_pcp *stock, int nr,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
- stock = this_cpu_ptr(&memcg_stock);
-
/*
* Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
- * changes.
+ * accumulating over a page of vmstat data or when pgdat changes.
*/
- if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
- stock->cached_pgdat = pgdat;
- } else if (stock->cached_pgdat != pgdat) {
+ if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -2760,37 +2918,38 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- __mod_objcg_mlstate(objcg, pgdat, idx, nr);
-
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
+ mod_objcg_mlstate(objcg, pgdat, idx, nr);
}
-static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock))
+ return ret;
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
+
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock(&obj_stock.lock);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
{
struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
- return NULL;
+ return;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -2803,7 +2962,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- __refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2827,13 +2987,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- __mod_objcg_mlstate(old, stock->cached_pgdat,
+ mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -2842,67 +3002,76 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
WRITE_ONCE(stock->cached_objcg, NULL);
- /*
- * The `old' objects needs to be released by the caller via
- * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
- */
- return old;
+ obj_cgroup_put(old);
}
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
if (objcg) {
memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
+ flush = true;
}
+ rcu_read_unlock();
- return false;
+ return flush;
}
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
- bool allow_uncharge)
+ bool allow_uncharge, int nr_acct, struct pglist_data *pgdat,
+ enum node_stat_item idx)
{
- struct memcg_stock_pcp *stock;
- struct obj_cgroup *old = NULL;
- unsigned long flags;
+ struct obj_stock_pcp *stock;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!local_trylock(&obj_stock.lock)) {
+ if (pgdat)
+ mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
- stock = this_cpu_ptr(&memcg_stock);
+ stock = this_cpu_ptr(&obj_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = drain_obj_stock(stock);
+ drain_obj_stock(stock);
obj_cgroup_get(objcg);
- WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
+ if (pgdat)
+ __account_obj_stock(objcg, stock, nr_acct, pgdat, idx);
+
if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
nr_pages = stock->nr_bytes >> PAGE_SHIFT;
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- obj_cgroup_put(old);
-
+ local_unlock(&obj_stock.lock);
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
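
refill_obj_stock() accumulates sub-page remainders in the per-cpu object
stock and only uncharges whole pages once more than a page's worth of bytes
has built up; on trylock failure the bytes bypass the cache entirely. A
short sketch of the byte-to-page spill, assuming a 4096-byte page:

#include <stdio.h>

#define PAGE_SIZE 4096U                 /* assumed for the example */

static unsigned int cached_bytes;       /* models obj_stock_pcp::nr_bytes */

static unsigned int stock_bytes(unsigned int nr_bytes)
{
    unsigned int nr_pages = 0;

    cached_bytes += nr_bytes;
    if (cached_bytes > PAGE_SIZE) {
        /* spill whole pages back, keep only the sub-page remainder */
        nr_pages = cached_bytes / PAGE_SIZE;
        cached_bytes &= PAGE_SIZE - 1;
    }
    return nr_pages;                    /* pages the caller would uncharge */
}

int main(void)
{
    unsigned int a = stock_bytes(3000); /* 0: still below one page */
    unsigned int b = stock_bytes(3000); /* 1: 6000 bytes spill one page */

    printf("%u %u remainder=%u\n", a, b, cached_bytes); /* 0 1 remainder=1904 */
    return 0;
}
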
-int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size,
+ struct pglist_data *pgdat, enum node_stat_item idx)
{
unsigned int nr_pages, nr_bytes;
int ret;
- if (consume_obj_stock(objcg, size))
+ if (likely(consume_obj_stock(objcg, size, pgdat, idx)))
return 0;
/*
@@ -2935,15 +3104,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
nr_pages += 1;
ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
- if (!ret && nr_bytes)
- refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+ if (!ret && (nr_bytes || pgdat))
+ refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0,
+ false, size, pgdat, idx);
return ret;
}
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0);
+}
+
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_obj_stock(objcg, size, true);
+ refill_obj_stock(objcg, size, true, 0, NULL, 0);
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -2995,23 +3170,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
return false;
}
- if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
- return false;
-
for (i = 0; i < size; i++) {
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
+ /*
+ * if we fail and size is 1, memcg_alloc_abort_single() will
+ * just free the object, which is ok as we have not assigned
+ * objcg to its obj_ext yet
+ *
+ * for larger sizes, kmem_cache_free_bulk() will uncharge
+ * any objects that were already charged and obj_ext assigned
+ *
+ * TODO: we could batch this until slab_pgdat(slab) changes
+ * between iterations, with a more complicated undo
+ */
+ if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s),
+ slab_pgdat(slab), cache_vmstat_idx(s)))
+ return false;
+
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
slab_obj_exts(slab)[off].objcg = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
}
return true;
@@ -3020,6 +3204,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts)
{
+ size_t obj_size = obj_full_size(s);
+
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
unsigned int off;
@@ -3030,33 +3216,40 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
continue;
obj_exts[off].objcg = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
+ refill_obj_stock(objcg, obj_size, true, -obj_size,
+ slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
}
}
/*
- * Because folio_memcg(head) is not set on tails, set it now.
+ * The objcg is only set on the first page, so transfer it to all the
+ * other pages.
*/
-void split_page_memcg(struct page *head, int old_order, int new_order)
+void split_page_memcg(struct page *page, unsigned order)
{
- struct folio *folio = page_folio(head);
- int i;
- unsigned int old_nr = 1 << old_order;
- unsigned int new_nr = 1 << new_order;
+ struct obj_cgroup *objcg = page_objcg(page);
+ unsigned int i, nr = 1 << order;
- if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ if (!objcg)
return;
- for (i = new_nr; i < old_nr; i += new_nr)
- folio_page(folio, i)->memcg_data = folio->memcg_data;
+ for (i = 1; i < nr; i++)
+ page_set_objcg(&page[i], objcg);
- if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
- else
- css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
+ obj_cgroup_get_many(objcg, nr - 1);
+}
+
+void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
+ unsigned new_order)
+{
+ unsigned new_refs;
+
+ if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ return;
+
+ new_refs = (1 << (old_order - new_order)) - 1;
+ css_get_many(&__folio_memcg(folio)->css, new_refs);
}
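
For example, splitting an order-9 folio down to order-0 turns one folio into
512, so new_refs = (1 << (9 - 0)) - 1 = 511 and that many extra css
references are taken here, one for each folio beyond the original that now
pins the memcg.
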
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3384,7 +3577,7 @@ void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
refcount_add(n, &memcg->id.ref);
}
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
@@ -3399,6 +3592,24 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
mem_cgroup_id_put_many(memcg, 1);
}
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
+ /*
+	 * The root cgroup cannot be destroyed, so its refcount must
+ * always be >= 1.
+ */
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
+ VM_BUG_ON(1);
+ break;
+ }
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ }
+ return memcg;
+}
+
/**
* mem_cgroup_from_id - look up a memcg from a memcg id
* @id: the memcg id to look up
@@ -3434,11 +3645,22 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
}
#endif
+static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn)
+{
+ if (!pn)
+ return;
+
+ free_percpu(pn->lruvec_stats_percpu);
+ kfree(pn->lruvec_stats);
+ kfree(pn);
+}
+
static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+ node);
if (!pn)
return false;
@@ -3458,23 +3680,10 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
memcg->nodeinfo[node] = pn;
return true;
fail:
- kfree(pn->lruvec_stats);
- kfree(pn);
+ free_mem_cgroup_per_node_info(pn);
return false;
}
-static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
-{
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
-
- if (!pn)
- return;
-
- free_percpu(pn->lruvec_stats_percpu);
- kfree(pn->lruvec_stats);
- kfree(pn);
-}
-
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
@@ -3482,7 +3691,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
obj_cgroup_put(memcg->orig_objcg);
for_each_node(node)
- free_mem_cgroup_per_node_info(memcg, node);
+ free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3498,13 +3707,14 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
- struct memcg_vmstats_percpu *statc, *pstatc;
+ struct memcg_vmstats_percpu *statc;
+ struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
long error;
- memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+ memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg)
return ERR_PTR(-ENOMEM);
@@ -3529,9 +3739,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
for_each_possible_cpu(cpu) {
if (parent)
- pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ pstatc_pcpu = parent->vmstats_percpu;
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- statc->parent = parent ? pstatc : NULL;
+ statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
}
@@ -3575,6 +3785,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
struct mem_cgroup *memcg, *old_memcg;
+ bool memcg_on_dfl = cgroup_subsys_on_dfl(memory_cgrp_subsys);
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc(parent);
@@ -3592,9 +3803,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
- page_counter_init(&memcg->memory, &parent->memory, true);
+ page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
+ memcg->memory.track_failcnt = !memcg_on_dfl;
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
page_counter_init(&memcg->kmem, &parent->kmem, false);
page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
@@ -3612,7 +3824,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ if (memcg_on_dfl && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
if (!cgroup_memory_nobpf)
@@ -3803,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic_read(&memcg->kmem_stat)) {
+ int kmem = atomic_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic_read(&pn->slab_reclaimable)) {
+ int slab = atomic_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic_read(&pn->slab_unreclaimable)) {
+ int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+#else
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{}
+#endif
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -3811,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@@ -3860,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
- if (atomic64_read(&memcg->vmstats->stats_updates))
- atomic64_set(&memcg->vmstats->stats_updates, 0);
+ if (atomic_read(&memcg->vmstats->stats_updates))
+ atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)
@@ -4014,7 +4275,7 @@ static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
WRITE_ONCE(peer_ctx->value, usage);
/* initial write, register watcher */
- if (ofp->value == -1)
+ if (ofp->value == OFP_PEAK_UNSET)
list_add(&ofp->list, watchers);
WRITE_ONCE(ofp->value, usage);
@@ -4102,6 +4363,9 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -4124,7 +4388,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (!reclaimed && !nr_retries--)
break;
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
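
With this change, a writer that opens the control file with O_NONBLOCK gets
the new limit recorded but skips the synchronous reclaim loop (memory.max
below gets the same treatment). A minimal userspace sketch, assuming cgroup2
is mounted at /sys/fs/cgroup and using a hypothetical group name:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* hypothetical cgroup path; adjust to a real group */
    const char *path = "/sys/fs/cgroup/example/memory.high";
    const char *limit = "1073741824\n";             /* 1G */
    int fd = open(path, O_WRONLY | O_NONBLOCK);     /* skip the reclaim loop */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, limit, strlen(limit)) < 0)
        perror("write");
    close(fd);
    return 0;
}
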
@@ -4151,6 +4415,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
xchg(&memcg->memory.max, max);
+ if (of->file->f_flags & O_NONBLOCK)
+ goto out;
+
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -4176,8 +4443,9 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
+ cond_resched();
}
-
+out:
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -4300,11 +4568,13 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
enum {
MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_SWAPPINESS_MAX,
MEMORY_RECLAIM_NULL,
};
static const match_table_t tokens = {
{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"},
{ MEMORY_RECLAIM_NULL, NULL },
};
@@ -4338,6 +4608,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
return -EINVAL;
break;
+ case MEMORY_RECLAIM_SWAPPINESS_MAX:
+ swappiness = SWAPPINESS_ANON_ONLY;
+ break;
default:
return -EINVAL;
}
@@ -4498,7 +4771,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
if (ret)
goto out;
- mem_cgroup_commit_charge(folio, memcg);
+ css_get(&memcg->css);
+ commit_charge(folio, memcg);
+ memcg1_commit_charge(folio, memcg);
out:
return ret;
}
@@ -4516,38 +4791,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
- * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
- * @memcg: memcg to charge.
- * @gfp: reclaim mode.
- * @nr_pages: number of pages to charge.
- *
- * This function is called when allocating a huge page folio to determine if
- * the memcg has the capacity for it. It does not commit the charge yet,
- * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
+ * @folio: folio being charged
+ * @gfp: reclaim mode
*
- * Once we have obtained the hugetlb folio, we can call
- * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
- * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
- * of try_charge().
+ * This function is called when allocating a huge page folio, after the page has
+ * already been obtained and charged to the appropriate hugetlb cgroup
+ * controller (if it is enabled).
*
- * Returns 0 on success. Otherwise, an error code is returned.
+ * Returns ENOMEM if the memcg is already full.
+ * Returns 0 if either the charge was successful, or if we skip the charging.
*/
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
- long nr_pages)
+int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_current();
+ int ret = 0;
+
/*
- * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
- * but do not attempt to commit charge later (or cancel on error) either.
+	 * Even if memcg does not account for hugetlb, we still want to update
+ * system-level stats via lruvec_stat_mod_folio. Return 0, and skip
+ * charging the memcg.
*/
- if (mem_cgroup_disabled() || !memcg ||
- !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
- return -EOPNOTSUPP;
+ if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() ||
+ !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ goto out;
- if (try_charge(memcg, gfp, nr_pages))
- return -ENOMEM;
+ if (charge_memcg(folio, memcg, gfp))
+ ret = -ENOMEM;
- return 0;
+out:
+ mem_cgroup_put(memcg);
+ return ret;
}
/**
@@ -4585,40 +4859,6 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
return ret;
}
-/*
- * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
- * @entry: the first swap entry for which the pages are charged
- * @nr_pages: number of pages which will be uncharged
- *
- * Call this function after successfully adding the charged page to swapcache.
- *
- * Note: This function assumes the page for which swap slot is being uncharged
- * is order 0 page.
- */
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
-{
- /*
- * Cgroup1's unified memory+swap counter has been charged with the
- * new swapcache page, finish the transfer by uncharging the swap
- * slot. The swap slot would also get uncharged when it dies, but
- * it can stick around indefinitely and we'd count the page twice
- * the entire time.
- *
- * Cgroup2 has separate resource counters for memory and swap,
- * so this is a non-issue here. Memory and swap charge lifetimes
- * correspond 1:1 to page and swap slot lifetimes: we charge the
- * page to memory here, and uncharge swap when the slot is freed.
- */
- if (!mem_cgroup_disabled() && do_memsw_account()) {
- /*
- * The swap entry might not get freed for a long time,
- * let's not wait for it. The page already received a
- * memory+swap charge, drop the swap entry duplicate.
- */
- mem_cgroup_uncharge_swap(entry, nr_pages);
- }
-}
-
struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_memory;
@@ -4635,9 +4875,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
if (ug->nr_memory) {
- page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
- if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
+ memcg_uncharge(ug->memcg, ug->nr_memory);
if (ug->nr_kmem) {
mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem);
memcg1_account_kmem(ug->memcg, -ug->nr_kmem);
@@ -4869,7 +5107,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
- if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+ if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
}
@@ -4913,15 +5151,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory);
/*
- * subsys_initcall() for memory controller.
+ * Memory controller init before cgroup_init() initialize root_mem_cgroup.
*
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here.
*/
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{
+ unsigned int memcg_size;
int cpu;
/*
@@ -4935,92 +5174,24 @@ static int __init mem_cgroup_init(void)
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_stock);
-
- return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
-#ifdef CONFIG_SWAP
-static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
-{
- while (!refcount_inc_not_zero(&memcg->id.ref)) {
- /*
- * The root cgroup cannot be destroyed, so it's refcount must
- * always be >= 1.
- */
- if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
- VM_BUG_ON(1);
- break;
- }
- memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
+ drain_local_memcg_stock);
+ INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+ drain_local_obj_stock);
}
- return memcg;
-}
-
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @folio: folio whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @folio to @entry.
- */
-void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
-{
- struct mem_cgroup *memcg, *swap_memcg;
- unsigned int nr_entries;
- unsigned short oldid;
-
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
-
- if (mem_cgroup_disabled())
- return;
-
- if (!do_memsw_account())
- return;
-
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
- return;
-
- /*
- * In case the memcg owning these pages has been offlined and doesn't
- * have an ID allocated to it anymore, charge the closest online
- * ancestor for the swap instead and transfer the memory+swap charge.
- */
- swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = folio_nr_pages(folio);
- /* Get references for the tail pages, too */
- if (nr_entries > 1)
- mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
- nr_entries);
- VM_BUG_ON_FOLIO(oldid, folio);
- mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- folio_unqueue_deferred_split(folio);
- folio->memcg_data = 0;
+ memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
+ memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, nr_entries);
+ memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
+ SLAB_PANIC | SLAB_HWCACHE_ALIGN);
- if (memcg != swap_memcg) {
- if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, nr_entries);
- page_counter_uncharge(&memcg->memsw, nr_entries);
- }
-
- memcg1_swapout(folio, memcg);
- css_put(&memcg->css);
+ return 0;
}
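
mem_cgroup_init() now carves memcgs out of a dedicated slab cache, so the
object size has to be computed up front from nr_node_ids because struct
mem_cgroup ends in the flexible nodeinfo[] array; struct_size_t() does that
math. A userspace sketch of the sizing rule (the types and node count are
stand-ins, and struct_size_t()'s overflow checking is omitted):

#include <stdio.h>
#include <stdlib.h>

struct pernode { long stats[4]; };

struct group {
    long counters[8];
    struct pernode *nodeinfo[];         /* flexible array member, like mem_cgroup */
};

/* size of a struct group with n nodeinfo slots */
static size_t group_size(size_t n)
{
    return sizeof(struct group) + n * sizeof(struct pernode *);
}

int main(void)
{
    size_t nr_node_ids = 2;             /* stand-in for the real node count */
    struct group *g = calloc(1, group_size(nr_node_ids));

    printf("object size: %zu bytes\n", group_size(nr_node_ids));
    free(g);
    return 0;
}
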
+#ifdef CONFIG_SWAP
/**
* __mem_cgroup_try_charge_swap - try charging swap space for a folio
* @folio: folio being added to swap
@@ -5035,7 +5206,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
- unsigned short oldid;
if (do_memsw_account())
return 0;
@@ -5064,10 +5234,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
/* Get references for the tail pages, too */
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+
return 0;
}
@@ -5081,7 +5251,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- id = swap_cgroup_record(entry, 0, nr_pages);
+ id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
@@ -5473,3 +5643,8 @@ static int __init mem_cgroup_swap_init(void)
subsys_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}