From 009bcbd0b201d4dc125eb960a61cb6d4d9fdfc72 Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 14 May 2026 14:50:32 +0800 Subject: cgroup/rdma: add rdma.events to track resource limit exhaustion Add per-device hierarchical event counters to track when RDMA resource limits are exceeded. The rdma.events file reports max event counts propagated upward from the cgroup whose limit was hit to all ancestors. This mirrors the design of pids.events, where events are attributed to the cgroup that imposed the limit, not necessarily the cgroup where the allocation was attempted. Userspace can monitor this file via poll/epoll for real-time notification of resource exhaustion. Signed-off-by: Tao Cui Signed-off-by: Tejun Heo --- include/linux/cgroup_rdma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index 80edae03c313..ac691fe7d3f5 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -24,6 +24,9 @@ struct rdma_cgroup { * that belongs to this cgroup. */ struct list_head rpools; + + /* Handle for rdma.events */ + struct cgroup_file events_file; }; struct rdmacg_device { -- cgit v1.2.3 From aefe4847f0891e2e71bedf5478d1cf350f86fc61 Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 14 May 2026 14:50:33 +0800 Subject: cgroup/rdma: add rdma.events.local for per-cgroup allocation failure attribution Add per-cgroup local event counters to track RDMA resource limit exhaustion from the perspective of individual cgroups. The rdma.events.local file reports two per-resource counters: - max: number of times this cgroup's limit was the one that blocked an allocation in the subtree - alloc_fail: number of allocation attempts originating from this cgroup that failed due to an ancestor's limit This mirrors the design of pids.events.local, where events are attributed to the cgroup that imposed the limit, not necessarily the cgroup where the allocation was attempted. 
Also extend rdma.events with a hierarchical alloc_fail counter that tracks allocation failures propagating upward from the requesting cgroup, complementing the existing max counter, so that rdma.events and rdma.events.local share the same output format. Signed-off-by: Tao Cui Signed-off-by: Tejun Heo --- include/linux/cgroup_rdma.h | 3 +- kernel/cgroup/rdma.c | 143 +++++++++++++++++++++++++++++++++----------- 2 files changed, 109 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index ac691fe7d3f5..404e746552ca 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -25,8 +25,9 @@ struct rdma_cgroup { */ struct list_head rpools; - /* Handle for rdma.events */ + /* Handles for rdma.events[.local] */ struct cgroup_file events_file; + struct cgroup_file events_local_file; }; struct rdmacg_device { diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 927bbf1eb949..7c238a9d64d4 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -82,8 +82,11 @@ struct rdmacg_resource_pool { /* total number counts which are set to max */ int num_max_cnt; - /* per-resource hierarchical max event counters */ + /* per-resource event counters */ u64 events_max[RDMACG_RESOURCE_MAX]; + u64 events_alloc_fail[RDMACG_RESOURCE_MAX]; + u64 events_local_max[RDMACG_RESOURCE_MAX]; + u64 events_local_alloc_fail[RDMACG_RESOURCE_MAX]; }; static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) @@ -131,6 +134,26 @@ static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) kfree(rpool); } +static bool rpool_has_persistent_state(struct rdmacg_resource_pool *rpool) +{ + int i; + + /* + * Keep the rpool alive if any peak value is non-zero, + * so that rdma.peak persists as a historical high- + * watermark even after all resources are freed. 
+ */ + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + if (rpool->resources[i].peak || + READ_ONCE(rpool->events_max[i]) || + READ_ONCE(rpool->events_local_max[i]) || + READ_ONCE(rpool->events_alloc_fail[i]) || + READ_ONCE(rpool->events_local_alloc_fail[i])) + return true; + } + return false; +} + static struct rdmacg_resource_pool * find_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) @@ -209,37 +232,30 @@ uncharge_cg_locked(struct rdma_cgroup *cg, rpool->usage_sum--; if (rpool->usage_sum == 0 && rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - int i; - - /* - * Keep the rpool alive if any peak value is non-zero, - * so that rdma.peak persists as a historical high- - * watermark even after all resources are freed. - */ - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (rpool->resources[i].peak || - READ_ONCE(rpool->events_max[i])) - return; + if (!rpool_has_persistent_state(rpool)) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); } - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); } } /** - * rdmacg_event_locked - fire hierarchical max event when resource limit is hit + * rdmacg_event_locked - fire event when resource allocation exceeds limit + * @cg: requesting cgroup * @over_cg: cgroup whose limit was exceeded * @device: rdma device * @index: resource type index * - * Must be called under rdmacg_mutex. Propagates max event counts - * from @over_cg (including itself) upward to all ancestors with - * an rpool and notifies userspace. + * Must be called under rdmacg_mutex. Updates local event counters in + * the rpools of @cg and @over_cg, propagates hierarchical max events + * from @over_cg and alloc_fail events from @cg upward (each including + * itself), and notifies userspace via cgroup_file_notify().
*/ -static void rdmacg_event_locked(struct rdma_cgroup *over_cg, +static void rdmacg_event_locked(struct rdma_cgroup *cg, + struct rdma_cgroup *over_cg, struct rdmacg_device *device, enum rdmacg_resource_type index) { @@ -248,6 +264,21 @@ static void rdmacg_event_locked(struct rdma_cgroup *over_cg, lockdep_assert_held(&rdmacg_mutex); + /* Increment local alloc_fail in requesting cgroup */ + rpool = find_cg_rpool_locked(cg, device); + if (rpool) { + rpool->events_local_alloc_fail[index]++; + cgroup_file_notify(&cg->events_local_file); + } + + /* Increment local max in the over-limit cgroup */ + rpool = find_cg_rpool_locked(over_cg, device); + if (rpool) { + rpool->events_local_max[index]++; + cgroup_file_notify(&over_cg->events_local_file); + } + + /* Propagate hierarchical max events upward */ for (p = over_cg; parent_rdmacg(p); p = parent_rdmacg(p)) { rpool = get_cg_rpool_locked(p, device); if (!IS_ERR(rpool)) { @@ -255,6 +286,14 @@ static void rdmacg_event_locked(struct rdma_cgroup *over_cg, cgroup_file_notify(&p->events_file); } } + /* Propagate hierarchical alloc_fail from requesting cgroup upward */ + for (p = cg; parent_rdmacg(p); p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (!IS_ERR(rpool)) { + rpool->events_alloc_fail[index]++; + cgroup_file_notify(&p->events_file); + } + } } /** @@ -368,7 +407,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, err: if (ret == -EAGAIN) - rdmacg_event_locked(p, device, index); + rdmacg_event_locked(cg, p, device, index); mutex_unlock(&rdmacg_mutex); rdmacg_uncharge_hierarchy(cg, device, p, index); return ret; @@ -525,18 +564,13 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, if (rpool->usage_sum == 0 && rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - int i; - - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (rpool->resources[i].peak || - READ_ONCE(rpool->events_max[i])) - goto dev_err; + if (!rpool_has_persistent_state(rpool)) { + /* + * No user of the rpool and all 
entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); } - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); } dev_err: @@ -618,9 +652,40 @@ static int rdmacg_events_show(struct seq_file *sf, void *v) seq_printf(sf, "%s ", device->name); for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - seq_printf(sf, "%s.max=%llu", + seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu", + rdmacg_resource_names[i], + rpool ? READ_ONCE(rpool->events_max[i]) : 0ULL, + rdmacg_resource_names[i], + rpool ? READ_ONCE(rpool->events_alloc_fail[i]) : 0ULL); + if (i < RDMACG_RESOURCE_MAX - 1) + seq_putc(sf, ' '); + } + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + +static int rdmacg_events_local_show(struct seq_file *sf, void *v) +{ + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + int i; + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + rpool = find_cg_rpool_locked(cg, device); + + seq_printf(sf, "%s ", device->name); + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu", + rdmacg_resource_names[i], + rpool ? READ_ONCE(rpool->events_local_max[i]) : 0ULL, rdmacg_resource_names[i], - rpool ? READ_ONCE(rpool->events_max[i]) : 0ULL); + rpool ? 
READ_ONCE(rpool->events_local_alloc_fail[i]) : 0ULL); if (i < RDMACG_RESOURCE_MAX - 1) seq_putc(sf, ' '); } @@ -657,6 +722,12 @@ static struct cftype rdmacg_files[] = { .file_offset = offsetof(struct rdma_cgroup, events_file), .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events.local", + .seq_show = rdmacg_events_local_show, + .file_offset = offsetof(struct rdma_cgroup, events_local_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; -- cgit v1.2.3 From 3360a5c16d87933fb74b530f5e016eb3dfffee5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:17 -1000 Subject: cgroup: Inline cgroup_has_tasks() in cgroup.h cpuset reads cs->css.cgroup->nr_populated_csets directly in two places to test whether a cgroup has tasks. cgroup.c already has a matching helper, cgroup_has_tasks(). Move it to cgroup.h as static inline and use that instead. This is to prepare for relocation of cgroup->nr_populated_csets. No semantic change. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ kernel/cgroup/cgroup.c | 5 ----- kernel/cgroup/cpuset-v1.c | 2 +- kernel/cgroup/cpuset.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..ceb87507667e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -639,6 +639,11 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } +static inline bool cgroup_has_tasks(struct cgroup *cgrp) +{ + return cgrp->nr_populated_csets; +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bd10a7e2f9c5..7a94c2ea1036 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -376,11 +376,6 @@ static void cgroup_idr_remove(struct idr *idr, int id) 
spin_unlock_bh(&cgroup_idr_lock); } -static bool cgroup_has_tasks(struct cgroup *cgrp) -{ - return cgrp->nr_populated_csets; -} - static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 7308e9b02495..3e9968dd91e9 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -312,7 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, * This is full cgroup operation which will also call back into * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty && cs->css.cgroup->nr_populated_csets && + if (is_empty && cgroup_has_tasks(cs->css.cgroup) && css_tryget_online(&cs->css)) { struct cpuset_remove_tasks_struct *s; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 74d5c494d6ae..8500e4341c60 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -432,7 +432,7 @@ static inline bool partition_is_populated(struct cpuset *cs, * nr_populated_domain_children may include populated * csets from descendants that are partitions. */ - if (cs->css.cgroup->nr_populated_csets || + if (cgroup_has_tasks(cs->css.cgroup) || cs->attach_in_progress) return true; -- cgit v1.2.3 From 44fabf05634ce9e90b3fb179ea962995b7bbaa09 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:18 -1000 Subject: cgroup: Annotate unlocked nr_populated_* accesses with READ_ONCE/WRITE_ONCE cgroup_update_populated() updates nr_populated_csets, nr_populated_domain_children, and nr_populated_threaded_children under css_set_lock, but cgroup_has_tasks(), cgroup_is_populated(), and cgroup_can_be_thread_root() read them without holding it. Use READ_ONCE/WRITE_ONCE. 
Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 21 +++++++++++++++++---- kernel/cgroup/cgroup.c | 11 +++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ceb87507667e..9f8bef8f3a60 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -639,16 +639,29 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } +/* + * Populated counters: writes happen under css_set_lock. The accessors below + * may read unlocked. What an unpopulated result means depends on context: + * + * - No lock held. Just a snapshot. May race with concurrent updates and is + * useful only as a hint. + * + * - cgroup_mutex held. Migration into the cgroup is blocked, so an observed + * !populated stays !populated until cgroup_mutex is dropped. + * + * - CSS_DYING set. The css can no longer be repopulated, so !populated is + * sticky once observed. 
+ */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { - return cgrp->nr_populated_csets; + return READ_ONCE(cgrp->nr_populated_csets); } -/* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + - cgrp->nr_populated_threaded_children; + return READ_ONCE(cgrp->nr_populated_csets) + + READ_ONCE(cgrp->nr_populated_domain_children) + + READ_ONCE(cgrp->nr_populated_threaded_children); } /* returns ino associated with a cgroup */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7a94c2ea1036..d1395784871a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -404,7 +404,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) return false; /* can only have either domain or threaded children */ - if (cgrp->nr_populated_domain_children) + if (READ_ONCE(cgrp->nr_populated_domain_children)) return false; /* and no domain controllers can be enabled */ @@ -783,12 +783,15 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) bool was_populated = cgroup_is_populated(cgrp); if (!child) { - cgrp->nr_populated_csets += adj; + WRITE_ONCE(cgrp->nr_populated_csets, + cgrp->nr_populated_csets + adj); } else { if (cgroup_is_threaded(child)) - cgrp->nr_populated_threaded_children += adj; + WRITE_ONCE(cgrp->nr_populated_threaded_children, + cgrp->nr_populated_threaded_children + adj); else - cgrp->nr_populated_domain_children += adj; + WRITE_ONCE(cgrp->nr_populated_domain_children, + cgrp->nr_populated_domain_children + adj); } if (was_populated == cgroup_is_populated(cgrp)) -- cgit v1.2.3 From c4799253a3ee74ebb27be72fb991c597a5902c01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:19 -1000 Subject: cgroup: Move populated counters to cgroup_subsys_state Later patches replace the cgroup-level finish_destroy_work deferral added by 93618edf7538 ("cgroup: Defer 
css percpu_ref kill on rmdir until cgroup is depopulated") with a per-subsys-css deferral. That needs each subsystem css to track its own populated count. Move the populated counters from cgroup onto cgroup_subsys_state. cgroup->self is itself a cgroup_subsys_state and self.parent walks the same chain as cgroup_parent(), so cgroup_update_populated() generalizes to a single css_update_populated() taking a css. The cgroup-side bookkeeping runs only when the walk started from a self css. Keep nr_populated_{domain,threaded}_children on cgroup. Both sum to self.nr_populated_children, but staying as dedicated fields to allow readers like cgroup_can_be_thread_root() unlocked access. css_set_update_populated() also walks the per-subsys-css chain so each subsystem css's hierarchical populated count is maintained. No reader consumes those counts yet. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 24 +++++++----- include/linux/cgroup.h | 11 ++++-- kernel/cgroup/cgroup.c | 95 ++++++++++++++++++++++++++------------------- 3 files changed, 76 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 50a784da7a81..c4929f7bbe5a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -253,6 +253,15 @@ struct cgroup_subsys_state { */ int nr_descendants; + /* + * Hierarchical populated state. For cgroup->self, nr_populated_csets + * counts populated csets linked via cgrp_cset_link. + * nr_populated_children counts immediate-child csses whose own + * populated state is nonzero. Protected by css_set_lock. + */ + int nr_populated_csets; + int nr_populated_children; + /* * A singly-linked list of css structures to be rstat flushed. * This is a scratch field to be used exclusively by @@ -504,17 +513,12 @@ struct cgroup { int max_descendants; /* - * Each non-empty css_set associated with this cgroup contributes - * one to nr_populated_csets. 
The counter is zero iff this cgroup - * doesn't have any tasks. - * - * All children which have non-zero nr_populated_csets and/or - * nr_populated_children of their own contribute one to either - * nr_populated_domain_children or nr_populated_threaded_children - * depending on their type. Each counter is zero iff all cgroups - * of the type in the subtree proper don't have any tasks. + * Domain/threaded split of self.nr_populated_children: each counts + * immediate-child cgroups whose subtree is populated and sums to + * self.nr_populated_children. Kept as separate fields to allow readers + * like cgroup_can_be_thread_root() unlocked access. Protected by + * css_set_lock; updated by css_update_populated(). */ - int nr_populated_csets; int nr_populated_domain_children; int nr_populated_threaded_children; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9f8bef8f3a60..c2a8c38d8206 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -654,14 +654,17 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { - return READ_ONCE(cgrp->nr_populated_csets); + return READ_ONCE(cgrp->self.nr_populated_csets); +} + +static inline bool css_is_populated(struct cgroup_subsys_state *css) +{ + return READ_ONCE(css->nr_populated_csets) || READ_ONCE(css->nr_populated_children); } static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return READ_ONCE(cgrp->nr_populated_csets) + - READ_ONCE(cgrp->nr_populated_domain_children) + - READ_ONCE(cgrp->nr_populated_threaded_children); + return css_is_populated(&cgrp->self); } /* returns ino associated with a cgroup */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d1395784871a..dd4ea9d83100 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -756,65 +756,70 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - update the populated count of a cgroup - 
* @cgrp: the target cgroup - * @populated: inc or dec populated count - * - * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->nr_populated_* accordingly. The - * count is propagated towards root so that a given cgroup's - * nr_populated_children is zero iff none of its descendants contain any - * tasks. - * - * @cgrp's interface file "cgroup.populated" is zero if both - * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and - * 1 otherwise. When the sum changes from or to zero, userland is notified - * that the content of the interface file has changed. This can be used to - * detect when @cgrp and its descendants become populated or empty. + * css_update_populated - update the populated state of a css and ancestors + * @css: leaf css whose own populated count is changing + * @populated: inc or dec + * + * One of the css_sets pinned by @css is getting its first task or losing the + * last. Propagate the transition up the parent chain so that a css's + * nr_populated_children is zero iff none of its descendants contain any tasks. + * + * For a cgroup->self walk, also runs cgroup-side bookkeeping at each level: + * domain/threaded child split, deferred-destroy trigger, and notification via + * "cgroup.populated" (zero iff cgrp->self has neither populated csets nor + * populated children; userland is notified on transitions). */ -static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +static void css_update_populated(struct cgroup_subsys_state *css, bool populated) { - struct cgroup *child = NULL; + struct cgroup_subsys_state *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { - bool was_populated = cgroup_is_populated(cgrp); + /* non-NULL only on the cgroup->self walk */ + struct cgroup *cgrp = css_is_self(css) ? 
css->cgroup : NULL; + bool was_populated = css_is_populated(css); if (!child) { - WRITE_ONCE(cgrp->nr_populated_csets, - cgrp->nr_populated_csets + adj); + WRITE_ONCE(css->nr_populated_csets, + css->nr_populated_csets + adj); } else { - if (cgroup_is_threaded(child)) - WRITE_ONCE(cgrp->nr_populated_threaded_children, - cgrp->nr_populated_threaded_children + adj); - else - WRITE_ONCE(cgrp->nr_populated_domain_children, - cgrp->nr_populated_domain_children + adj); + WRITE_ONCE(css->nr_populated_children, + css->nr_populated_children + adj); + if (cgrp) { + if (cgroup_is_threaded(child->cgroup)) + WRITE_ONCE(cgrp->nr_populated_threaded_children, + cgrp->nr_populated_threaded_children + adj); + else + WRITE_ONCE(cgrp->nr_populated_domain_children, + cgrp->nr_populated_domain_children + adj); + } } - if (was_populated == cgroup_is_populated(cgrp)) + if (was_populated == css_is_populated(css)) break; /* * Subtree just emptied below an offlined cgrp. Fire deferred * destroy. The transition is one-shot. */ - if (was_populated && !css_is_online(&cgrp->self)) { + if (cgrp && was_populated && !css_is_online(css)) { cgroup_get(cgrp); WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &cgrp->finish_destroy_work)); } - cgroup1_check_for_release(cgrp); - TRACE_CGROUP_PATH(notify_populated, cgrp, - cgroup_is_populated(cgrp)); - cgroup_file_notify(&cgrp->events_file); + if (cgrp) { + cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); + cgroup_file_notify(&cgrp->events_file); + } - child = cgrp; - cgrp = cgroup_parent(cgrp); - } while (cgrp); + child = css; + css = css->parent; + } while (css); } /** @@ -822,17 +827,27 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @cset: target css_set * @populated: whether @cset is populated or depopulated * - * @cset is either getting the first task or losing the last. Update the - * populated counters of all associated cgroups accordingly. 
+ * @cset is either getting the first task or losing the last. Update the + * populated counters along each linked cgroup's self chain and each + * subsystem css that @cset pins. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - cgroup_update_populated(link->cgrp, populated); + css_update_populated(&link->cgrp->self, populated); + + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + + if (css) + css_update_populated(css, populated); + } } /* @@ -2190,7 +2205,7 @@ int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) - cgroup_update_populated(root_cgrp, true); + css_update_populated(&root_cgrp->self, true); } spin_unlock_irq(&css_set_lock); @@ -6145,7 +6160,7 @@ static void kill_css_finish(struct cgroup_subsys_state *css) * * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on * each subsystem css. Fires once @cgrp's subtree is fully drained, either - * inline here or from cgroup_update_populated(). + * inline here or from css_update_populated(). * * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> * ->css_offline() -> release/free. -- cgit v1.2.3 From cfc1da7e1127b4c8787f4dc25d59987c10c9107f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2026 14:51:20 -1000 Subject: cgroup: Add per-subsys-css kill_css_finish deferral 93618edf7538 ("cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated") deferred kill_css_finish() at the cgroup level: rmdir waits for the entire cgroup's populated count to drop to zero, then fires kill_css_finish() on every subsystem css at once. Replace that with per-subsys-css deferral. 
Each subsystem css now tracks its own hierarchical populated count and independently defers its kill_css_finish() until its own subtree drains. The rmdir-race fix carries through unchanged in shape. The dying css's ->css_offline() still waits until no PF_EXITING task references it, and v2's cgroup-level machinery goes away. cgroup_apply_control_disable() has the same race shape (PF_EXITING tasks pinning a css whose ->css_offline() is about to run) and stays synchronous here. This patch lays the groundwork for fixing it - per-cgroup waiting can't gate one subsys css being killed while the rest of the cgroup stays live, but per-css can. Subtree-wide invariant preserved: a dying ancestor css stays populated through nr_populated_children until every dying descendant's task drains, so the walker fires the ancestor's kill_finish_work only after all descendants have drained. Add paired smp_mb()s in kill_css_sync() and css_update_populated() to fence the StoreLoad on (CSS_DYING, populated counter), guaranteeing that either the walker queues kill_finish_work or the caller fires synchronously. cgroup_destroy_locked() was implicitly fenced by an unrelated css_set_lock pair; cgroup_apply_control_disable() in the next patch is not. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 6 ++-- kernel/cgroup/cgroup.c | 83 +++++++++++++++++++++++---------------------- 2 files changed, 46 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c4929f7bbe5a..de2cd6238c2a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -262,6 +262,9 @@ struct cgroup_subsys_state { int nr_populated_csets; int nr_populated_children; + /* deferred kill_css_finish() queued by css_update_populated() */ + struct work_struct kill_finish_work; + /* * A singly-linked list of css structures to be rstat flushed. 
* This is a scratch field to be used exclusively by @@ -615,9 +618,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* defers killing csses after removal until cgroup is depopulated */ - struct work_struct finish_destroy_work; - /* used to schedule release agent */ struct work_struct release_agent_work; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd4ea9d83100..fa24102535d9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -264,7 +264,6 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); -static void cgroup_finish_destroy(struct cgroup *cgrp); static void kill_css_sync(struct cgroup_subsys_state *css); static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, @@ -801,13 +800,19 @@ static void css_update_populated(struct cgroup_subsys_state *css, bool populated break; /* - * Subtree just emptied below an offlined cgrp. Fire deferred - * destroy. The transition is one-shot. + * Pair with smp_mb() in kill_css_sync(). Either we observe + * CSS_DYING and queue, or the caller observes our decrement + * and fires synchronously. */ - if (cgrp && was_populated && !css_is_online(css)) { - cgroup_get(cgrp); - WARN_ON_ONCE(!queue_work(cgroup_offline_wq, - &cgrp->finish_destroy_work)); + smp_mb(); + + /* + * Subtree just emptied below a dying css. Fire deferred kill. + * The transition is one-shot for a dying css. 
+ */ + if (was_populated && css_is_dying(css)) { + css_get(css); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &css->kill_finish_work)); } if (cgrp) { @@ -2064,16 +2069,6 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } -static void cgroup_finish_destroy_work_fn(struct work_struct *work) -{ - struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work); - - cgroup_lock(); - cgroup_finish_destroy(cgrp); - cgroup_unlock(); - cgroup_put(cgrp); -} - static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2100,7 +2095,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -5695,6 +5689,22 @@ static void css_release(struct percpu_ref *ref) queue_work(cgroup_release_wq, &css->destroy_work); } +/* + * Deferred kill_css_finish() fired from css_update_populated() once a dying + * css's hierarchical populated state drops to zero. Pinned by css_get() at the + * queue site; matched by css_put() here. + */ +static void kill_css_finish_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, kill_finish_work); + + cgroup_lock(); + kill_css_finish(css); + cgroup_unlock(); + css_put(css); +} + static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -5708,6 +5718,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_WORK(&css->kill_finish_work, kill_css_finish_work_fn); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -6083,6 +6094,13 @@ static void kill_css_sync(struct cgroup_subsys_state *css) css->flags |= CSS_DYING; + /* + * Pair with smp_mb() in css_update_populated(). 
Either our + * caller observes the walker's decrement and fires + * synchronously, or the walker observes CSS_DYING and queues. + */ + smp_mb(); + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. @@ -6158,9 +6176,9 @@ static void kill_css_finish(struct cgroup_subsys_state *css) * - This function: synchronous user-visible state teardown plus kill_css_sync() * on each subsystem css. * - * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on - * each subsystem css. Fires once @cgrp's subtree is fully drained, either - * inline here or from css_update_populated(). + * - For each subsys css: fire kill_css_finish() synchronously if the subtree is + * already drained, otherwise rely on css_update_populated() to queue + * kill_finish_work when the last populated cset under the css empties. * * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> * ->css_offline() -> release/free. @@ -6238,29 +6256,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); - if (!cgroup_is_populated(cgrp)) - cgroup_finish_destroy(cgrp); + for_each_css(css, ssid, cgrp) { + if (!css_is_populated(css)) + kill_css_finish(css); + } return 0; }; -/** - * cgroup_finish_destroy - deferred half of @cgrp destruction - * @cgrp: cgroup whose subtree just became empty - * - * See cgroup_destroy_locked() for the rationale. - */ -static void cgroup_finish_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys_state *css; - int ssid; - - lockdep_assert_held(&cgroup_mutex); - - for_each_css(css, ssid, cgrp) - kill_css_finish(css); -} - int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; -- cgit v1.2.3