diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-15 20:18:49 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-15 20:18:49 +0300 |
| commit | b71f0be2d23d876648758d57bc6761500e3b9c70 (patch) | |
| tree | 323b07b4f1234d912b86b958730eb6cf5088b9f3 | |
| parent | 05cef13fa80de8cec481ae5a015e58bc6340ca2d (diff) | |
| parent | 3348e1e83a0f8a5ca1095843bc3316aaef7aae34 (diff) | |
| download | linux-b71f0be2d23d876648758d57bc6761500e3b9c70.tar.xz | |
Merge tag 'cgroup-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- cgroup_file_notify() locking converted from a global lock to a
  per-cgroup_file spinlock with a lockless fast-path when no
  notification is needed
- Misc changes including exposing cgroup helpers for sched_ext and
minor fixes
* tag 'cgroup-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/rdma: fix swapped arguments in pr_warn() format string
  cgroup/dmem: remove region parameter from dmemcg_parse_limit
  cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock
  cgroup: add lockless fast-path checks to cgroup_file_notify()
  cgroup: reduce cgroup_file_kn_lock hold time in cgroup_file_notify()
  cgroup: Expose some cgroup helpers
| -rw-r--r-- | include/linux/cgroup-defs.h | 1 | ||||
| -rw-r--r-- | include/linux/cgroup.h | 65 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup-internal.h | 6 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 108 | ||||
| -rw-r--r-- | kernel/cgroup/dmem.c | 5 | ||||
| -rw-r--r-- | kernel/cgroup/rdma.c | 2 |
6 files changed, 95 insertions, 92 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7f87399938fa..f197ca104737 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -167,6 +167,7 @@ struct cgroup_file { struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; + spinlock_t lock; }; /* diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bc892e3b37ee..e52160e85af4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -42,6 +42,14 @@ struct kernel_clone_args; #ifdef CONFIG_CGROUPS +/* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -76,6 +84,7 @@ enum cgroup_lifetime_events { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; @@ -103,6 +112,8 @@ extern struct blocking_notifier_head cgroup_lifetime_notifier; #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) +bool cgroup_on_dfl(const struct cgroup *cgrp); + bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, @@ -274,6 +285,32 @@ void css_task_iter_end(struct css_task_iter *it); for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +/* iterate over child cgrps, lock should be held 
throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + +/* walk live descendants in pre order */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor @@ -337,6 +374,27 @@ static inline u64 cgroup_id(const struct cgroup *cgrp) } /** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. 
+ */ +static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (CGROUP_HAS_SUBSYS_CONFIG && ss) + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->self; +} + +/** * css_is_dying - test whether the specified css is dying * @css: target css * @@ -372,6 +430,11 @@ static inline bool css_is_self(struct cgroup_subsys_state *css) return false; } +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); @@ -387,8 +450,6 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } -extern struct mutex cgroup_mutex; - static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 3bfe37693d68..58797123b752 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible; for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset) } bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4ca3cb993da2..1f084ee71443 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -69,14 +69,6 @@ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* - * To avoid confusing the compiler (and 
generating warnings) with code - * that attempts to access what would be a 0-element array (i.e. sized - * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this - * constant expression can be added. - */ -#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) - -/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -107,12 +99,6 @@ static bool cgroup_debug __read_mostly; */ static DEFINE_SPINLOCK(cgroup_idr_lock); -/* - * Protects cgroup_file->kn for !self csses. It synchronizes notifications - * against file removal/re-creation across css hiding. - */ -static DEFINE_SPINLOCK(cgroup_file_kn_lock); - DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ @@ -510,27 +496,6 @@ static u32 cgroup_ss_mask(struct cgroup *cgrp) } /** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This - * function must be called either under cgroup_mutex or rcu_read_lock() and - * the caller is responsible for pinning the returned css if it wants to - * keep accessing it outside the said locks. This function may return - * %NULL if @cgrp doesn't have @subsys_id enabled. 
- */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - if (CGROUP_HAS_SUBSYS_CONFIG && ss) - return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); - else - return &cgrp->self; -} - -/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -741,32 +706,6 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) -/* iterate over child cgrps, lock should be held throughout iteration */ -#define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - cgroup_is_dead(child); })) \ - ; \ - else - -/* walk live descendants in pre order */ -#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ - css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -/* walk live descendants in postorder */ -#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ - css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. 
It contains a pointer to the root state @@ -1748,9 +1687,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = NULL; - spin_unlock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); + WRITE_ONCE(cfile->kn, NULL); + spin_unlock_irq(&cfile->lock); timer_delete_sync(&cfile->notify_timer); } @@ -4429,10 +4368,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); - - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_init(&cfile->lock); cfile->kn = kn; - spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -4687,21 +4624,32 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ void cgroup_file_notify(struct cgroup_file *cfile) { - unsigned long flags; + unsigned long flags, last, next; + struct kernfs_node *kn = NULL; - spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (!READ_ONCE(cfile->kn)) + return; + + last = READ_ONCE(cfile->notified_at); + next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + if (timer_pending(&cfile->notify_timer)) + return; + } + + spin_lock_irqsave(&cfile->lock, flags); if (cfile->kn) { - unsigned long last = cfile->notified_at; - unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + kn = cfile->kn; + kernfs_get(kn); + WRITE_ONCE(cfile->notified_at, jiffies); + } + spin_unlock_irqrestore(&cfile->lock, flags); - if (time_in_range(jiffies, last, next)) { - timer_reduce(&cfile->notify_timer, next); - } else { - kernfs_notify(cfile->kn); - cfile->notified_at = jiffies; - } + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } 
EXPORT_SYMBOL_GPL(cgroup_file_notify); @@ -4714,10 +4662,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); kn = cfile->kn; kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); if (kn) kernfs_show(kn, show); diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 9d95824dc6fa..1ab1fb47f271 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -707,8 +707,7 @@ static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) return 0; } -static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, - u64 *new_limit) +static int dmemcg_parse_limit(char *options, u64 *new_limit) { char *end; @@ -762,7 +761,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, if (!region) return -EINVAL; - err = dmemcg_parse_limit(options, region, &new_limit); + err = dmemcg_parse_limit(options, &new_limit); if (err < 0) goto out_put; diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 09258eebb5c7..9967fb25c563 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -173,7 +173,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg, * the system. */ if (unlikely(!rpool)) { - pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg); return; } |
