From 519fabc7aaba3f0847cf37d5f9a5740c370eb777 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Mar 2023 17:13:46 -0800 Subject: psi: remove 500ms min window size limitation for triggers Current 500ms min window size for psi triggers limits polling interval to 50ms to prevent polling threads from using too much cpu bandwidth by polling too frequently. However the number of cgroups with triggers is unlimited, so this protection can be defeated by creating multiple cgroups with psi triggers (triggers in each cgroup are served by a single "psimon" kernel thread). Instead of limiting min polling period, which also limits the latency of psi events, it's better to limit psi trigger creation to authorized users only, like we do for system-wide psi triggers (/proc/pressure/* files can be written only by processes with CAP_SYS_RESOURCE capability). This also makes access rules for cgroup psi files consistent with system-wide ones. Add a CAP_SYS_RESOURCE capability check for cgroup psi file writers and remove the psi window min size limitation. Suggested-by: Sudarshan Rajagopalan Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Acked-by: Johannes Weiner Link: https://lore.kernel.org/all/cover.1676067791.git.quic_sudaraja@quicinc.com/ --- kernel/cgroup/cgroup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 625d7483951c..b26ae200abef 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3877,6 +3877,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } +static int cgroup_pressure_open(struct kernfs_open_file *of) +{ + if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return 0; +} + static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5276,6 +5284,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), + .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5284,6 +5293,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), + .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5292,6 +5302,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), + .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5301,6 +5312,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, -- cgit v1.2.3 From 6c24849f5515e4966d94fa5279bdff4acf2e9489 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 May 2023 09:58:51 +0200 Subject: sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets Qais reported that iterating over all tasks when rebuilding root domains for finding out which ones are DEADLINE and need their bandwidth correctly restored on such root domains can be a costly operation (10+ ms delays on suspend-resume). To fix the problem keep track of the number of DEADLINE tasks belonging to each cpuset and then use this information (followup patch) to only perform the above iteration if DEADLINE tasks are actually present in the cpuset for which a corresponding root domain is being rebuilt. Reported-by: Qais Yousef Link: https://lore.kernel.org/lkml/20230206221428.2125324-1-qyousef@layalina.io/ Signed-off-by: Juri Lelli Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 4 ++++ kernel/cgroup/cgroup.c | 4 ++++ kernel/cgroup/cpuset.c | 25 +++++++++++++++++++++++++ kernel/sched/deadline.c | 14 ++++++++++++++ 4 files changed, 47 insertions(+) (limited to 'kernel/cgroup/cgroup.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index f90e6325d707..d629094fac6e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -71,6 +71,8 @@ extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); +extern void inc_dl_tasks_cs(struct task_struct *task); +extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -189,6 +191,8 @@ static inline void cpuset_update_active_cpus(void) static inline void cpuset_wait_for_hotplug(void) { } +static inline void inc_dl_tasks_cs(struct task_struct *task) { } +static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 625d7483951c..9d809191a54f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -6683,6 +6684,9 @@ void cgroup_exit(struct task_struct *tsk) list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + if (dl_task(tsk)) + dec_dl_tasks_cs(tsk); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); if (unlikely(!(tsk->flags & PF_KTHREAD) && test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 041c0809adaf..ca195ff8b298 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -193,6 +193,12 @@ struct cpuset { int use_parent_ecpus; int child_ecpus_count; + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; @@ -245,6 +251,20 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } +void inc_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks++; +} + +void dec_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks--; +} + /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, @@ -2499,6 +2519,11 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) ret = security_task_setscheduler(task); if (ret) goto out_unlock; + + if (dl_task(task)) { + cs->nr_deadline_tasks++; + cpuset_attach_old_cs->nr_deadline_tasks--; + } } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a9a4b81c972..e11de074a6fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,8 @@ * Fabio Checconi */ +#include + /* * Default limits for DL period; on the top end we guard against small util * tasks still getting ridiculously long effective runtimes, on the bottom end we @@ -2596,6 +2598,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); + /* + * In case a task is setscheduled out from SCHED_DEADLINE we need to + * keep track of that on its cpuset (for correct bandwidth tracking). + */ + dec_dl_tasks_cs(p); + if (!task_on_rq_queued(p)) { /* * Inactive timer is armed. However, p is leaving DEADLINE and @@ -2636,6 +2644,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); + /* + * In case a task is setscheduled to SCHED_DEADLINE we need to keep + * track of that on its cpuset (for correct bandwidth tracking). + */ + inc_dl_tasks_cs(p); + /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { add_rq_bw(&p->dl, &rq->dl); -- cgit v1.2.3 From 2bd110339288c18823dcace602b63b0d8627e520 Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Sun, 21 May 2023 19:29:53 +0000 Subject: cgroup: always put cset in cgroup_css_set_put_fork A successful call to cgroup_css_set_fork() will always have taken a ref on kargs->cset (regardless of CLONE_INTO_CGROUP), so always do a corresponding put in cgroup_css_set_put_fork(). Without this, a cset and its contained css structures will be leaked for some fork failures. The following script reproduces the leak for a fork failure due to exceeding pids.max in the pids controller. A similar thing can happen if we jump to the bad_fork_cancel_cgroup label in copy_process(). [ -z "$1" ] && echo "Usage $0 pids-root" && exit 1 PID_ROOT=$1 CGROUP=$PID_ROOT/foo [ -e $CGROUP ] && rmdir -f $CGROUP mkdir $CGROUP echo 5 > $CGROUP/pids.max echo $$ > $CGROUP/cgroup.procs fork_bomb() { set -e for i in $(seq 10); do /bin/sleep 3600 & done } (fork_bomb) & wait echo $$ > $PID_ROOT/cgroup.procs kill $(cat $CGROUP/cgroup.procs) rmdir $CGROUP Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups") Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: John Sperbeck Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 625d7483951c..245cf62ce85a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6486,19 +6486,18 @@ err: static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + cgroup_threadgroup_change_end(current); - if (kargs->flags & CLONE_INTO_CGROUP) { - struct cgroup *cgrp = kargs->cgrp; - struct css_set *cset = kargs->cset; + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + if (kargs->flags & CLONE_INTO_CGROUP) { cgroup_unlock(); - - if (cset) { - put_css_set(cset); - kargs->cset = NULL; - } - if (cgrp) { cgroup_put(cgrp); kargs->cgrp = NULL; -- cgit v1.2.3 From 659db0789c2e66c5d6a52d57008e3a7401a3ffff Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 24 May 2023 14:54:31 +0800 Subject: cgroup: Update out-of-date comment in cgroup_migrate() Commit 674b745e22b3 ("cgroup: remove rcu_read_lock()/rcu_read_unlock() in critical section of spin_lock_irq()") has removed the rcu_read_lock, which makes the comment out-of-date, so update it. tj: Updated the comment a bit. Signed-off-by: Xiu Jianfeng Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9d809191a54f..f329f49529a2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2872,9 +2872,9 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct task_struct *task; /* - * Prevent freeing of tasks while we take a snapshot. Tasks that are - * already PF_EXITING could be freed from underneath us unless we - * take an rcu_read_lock. + * The following thread iteration should be inside an RCU critical + * section to prevent tasks from being freed while taking the snapshot. + * spin_lock_irq() implies RCU critical section here. */ spin_lock_irq(&css_set_lock); task = leader; -- cgit v1.2.3 From a49a11dc6449a7e1370441bc23d83d528787066d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 27 May 2023 17:33:53 +0800 Subject: cgroup: remove unused macro for_each_e_css() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for_each_e_css() is unused now. Remove it. Signed-off-by: Miaohe Lin Reviewed-by: Yosry Ahmed Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f329f49529a2..eb3d9c1f25e3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -690,21 +690,6 @@ EXPORT_SYMBOL_GPL(of_css); lockdep_is_held(&cgroup_mutex)))) { } \ else -/** - * for_each_e_css - iterate all effective css's of a cgroup - * @css: the iteration cursor - * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end - * @cgrp: the target cgroup to iterate css's of - * - * Should be called under cgroup_[tree_]mutex. - */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css_by_mask(cgrp, \ - cgroup_subsys[(ssid)]))) \ - ; \ - else - /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor -- cgit v1.2.3 From 7bf11e90a30afcf79ff1acf28b01c6455799861a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Fri, 2 Jun 2023 15:43:46 +0800 Subject: cgroup: Replace the css_set call with cgroup_get We will release the refcnt of cgroup via cgroup_put, for example in the cgroup_lock_and_drain_offline function, so replace css_get with the cgroup_get function for better readability. Signed-off-by: Gaosheng Cui Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index eb3d9c1f25e3..f69e9a443ecc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -619,7 +619,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - css_get(&cgrp->self); + cgroup_get(cgrp); } /** -- cgit v1.2.3 From 0dad9b072b2b170a99fcefa330a1a3193d503d8c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 3 Jun 2023 15:13:04 +0800 Subject: cgroup: make cgroup_is_threaded() and cgroup_is_thread_root() static They're only called inside cgroup.c. Make them static. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 -- kernel/cgroup/cgroup.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 367b0a42ada9..c56071f150f2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -220,8 +220,6 @@ static inline void get_css_set(struct css_set *cset) bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); -bool cgroup_is_thread_root(struct cgroup *cgrp); -bool cgroup_is_threaded(struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f69e9a443ecc..57f31b234433 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -357,7 +357,7 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } -bool cgroup_is_threaded(struct cgroup *cgrp) +static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } @@ -396,7 +396,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) } /* is @cgrp root of a threaded subtree? */ -bool cgroup_is_thread_root(struct cgroup *cgrp) +static bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) -- cgit v1.2.3 From d16b3af46679a1eb21652c37711a60d3d4e6b8c0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Jun 2023 11:57:37 +0800 Subject: cgroup: remove unused task_cgroup_path() task_cgroup_path() is not used anymore. So remove it. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - kernel/cgroup/cgroup.c | 39 --------------------------------------- 2 files changed, 40 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 885f5395fcd0..1261a47932a6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -118,7 +118,6 @@ int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57f31b234433..04d1c0cde882 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2364,45 +2364,6 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, } EXPORT_SYMBOL_GPL(cgroup_path_ns); -/** - * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy - * @task: target task - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Determine @task's cgroup on the first (the one with the lowest non-zero - * hierarchy_id) cgroup hierarchy and copy its path into @buf. This - * function grabs cgroup_mutex and shouldn't be used inside locks used by - * cgroup controller callbacks. - * - * Return value is the same as kernfs_path(). - */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) -{ - struct cgroup_root *root; - struct cgroup *cgrp; - int hierarchy_id = 1; - int ret; - - cgroup_lock(); - spin_lock_irq(&css_set_lock); - - root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); - - if (root) { - cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); - } else { - /* if no hierarchy exists, everyone is in "/" */ - ret = strscpy(buf, "/", buflen); - } - - spin_unlock_irq(&css_set_lock); - cgroup_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(task_cgroup_path); - /** * cgroup_attach_lock - Lock for ->attach() * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem -- cgit v1.2.3 From 6f363f5aa845561f7ea496d8b1175e3204470486 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 10 Jun 2023 17:26:43 +0800 Subject: cgroup: Do not corrupt task iteration when rebinding subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found a refcount UAF bug as follows: refcount_t: addition on 0; use-after-free. WARNING: CPU: 1 PID: 342 at lib/refcount.c:25 refcount_warn_saturate+0xa0/0x148 Workqueue: events cpuset_hotplug_workfn Call trace: refcount_warn_saturate+0xa0/0x148 __refcount_add.constprop.0+0x5c/0x80 css_task_iter_advance_css_set+0xd8/0x210 css_task_iter_advance+0xa8/0x120 css_task_iter_next+0x94/0x158 update_tasks_root_domain+0x58/0x98 rebuild_root_domains+0xa0/0x1b0 rebuild_sched_domains_locked+0x144/0x188 cpuset_hotplug_workfn+0x138/0x5a0 process_one_work+0x1e8/0x448 worker_thread+0x228/0x3e0 kthread+0xe0/0xf0 ret_from_fork+0x10/0x20 then a kernel panic will be triggered as below: Unable to handle kernel paging request at virtual address 00000000c0000010 Call trace: cgroup_apply_control_disable+0xa4/0x16c rebind_subsystems+0x224/0x590 cgroup_destroy_root+0x64/0x2e0 css_free_rwork_fn+0x198/0x2a0 process_one_work+0x1d4/0x4bc worker_thread+0x158/0x410 kthread+0x108/0x13c ret_from_fork+0x10/0x18 The race that cause this bug can be shown as below: (hotplug cpu) | (umount cpuset) mutex_lock(&cpuset_mutex) | mutex_lock(&cgroup_mutex) cpuset_hotplug_workfn | rebuild_root_domains | rebind_subsystems update_tasks_root_domain | spin_lock_irq(&css_set_lock) css_task_iter_start | list_move_tail(&cset->e_cset_node[ss->id] while(css_task_iter_next) | &dcgrp->e_csets[ss->id]); css_task_iter_end | spin_unlock_irq(&css_set_lock) mutex_unlock(&cpuset_mutex) | mutex_unlock(&cgroup_mutex) Inside css_task_iter_start/next/end, css_set_lock is hold and then released, so when iterating task(left side), the css_set may be moved to another list(right side), then it->cset_head points to the old list head and it->cset_pos->next points to the head node of new list, which can't be used as struct css_set. To fix this issue, switch from all css_sets to only scgrp's css_sets to patch in-flight iterators to preserve correct iteration, and then update it->cset_head as well. Reported-by: Gaosheng Cui Link: https://www.spinics.net/lists/cgroups/msg37935.html Suggested-by: Michal Koutný Link: https://lore.kernel.org/all/20230526114139.70274-1-xiujianfeng@huaweicloud.com/ Signed-off-by: Xiu Jianfeng Fixes: 2d8f243a5e6e ("cgroup: implement cgroup->e_csets[]") Cc: stable@vger.kernel.org # v3.16+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245cf62ce85a..4d42f0cbc11e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1798,7 +1798,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - int ssid, i, ret; + int ssid, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1842,7 +1842,8 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); - struct css_set *cset; + struct css_set *cset, *cset_pos; + struct css_task_iter *it; WARN_ON(!css || cgroup_css(dcgrp, ss)); @@ -1860,9 +1861,22 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id])); + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id], + e_cset_node[ss->id]) { list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); + /* + * all css_sets of scgrp together in same order to dcgrp, + * patch in-flight iterators to preserve correct iteration. + * since the iterator is always advanced right away and + * finished when it->cset_pos meets it->cset_head, so only + * update it->cset_head is enough here. + */ + list_for_each_entry(it, &cset->task_iters, iters_node) + if (it->cset_head == &scgrp->e_csets[ss->id]) + it->cset_head = &dcgrp->e_csets[ss->id]; + } spin_unlock_irq(&css_set_lock); if (ss->css_rstat_flush) { -- cgit v1.2.3 From a04de42460e271e532b43ad40e9fb07a9a09fe5c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 17 Jun 2023 15:48:09 +0800 Subject: cgroup: remove obsolete comment on cgroup_on_dfl() The debug feature is supported since commit 8cc38fa7fa31 ("cgroup: make debug an implicit controller on cgroup2"), update corresponding comment. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 04d1c0cde882..065bebb4af9b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -313,8 +313,6 @@ bool cgroup_ssid_enabled(int ssid) * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { -- cgit v1.2.3 From 36de5f303ca1bd6fce74815ef17ef3d8ff8737b5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 14 Jun 2023 19:18:22 -0600 Subject: cgroup: Avoid -Wstringop-overflow warnings Address the following -Wstringop-overflow warnings seen when built with ARM architecture and aspeed_g4_defconfig configuration (notice that under this configuration CGROUP_SUBSYS_COUNT == 0): kernel/cgroup/cgroup.c:1208:16: warning: 'find_existing_css_set' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:1258:15: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:6089:18: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] kernel/cgroup/cgroup.c:6153:18: warning: 'css_set_hash' accessing 4 bytes in a region of size 0 [-Wstringop-overflow=] These changes are based on commit d20d30ebb199 ("cgroup: Avoid compiler warnings with no subsystems"). Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 065bebb4af9b..1404e8e8abf7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1198,6 +1198,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, unsigned long key; int ssid; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches @@ -6017,6 +6020,9 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return -EINVAL; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); -- cgit v1.2.3 From 81621430c81bb7965c3d5807039bc2b5b3ec87ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 22 Jun 2023 08:51:14 -1000 Subject: Revert "cgroup: Avoid -Wstringop-overflow warnings" This reverts commit 36de5f303ca1bd6fce74815ef17ef3d8ff8737b5. The commit caused boot failures on some configurations due to cgroup hierarchies not being created at all. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1404e8e8abf7..065bebb4af9b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1198,9 +1198,6 @@ static struct css_set *find_css_set(struct css_set *old_cset, unsigned long key; int ssid; - if (!CGROUP_HAS_SUBSYS_CONFIG) - return NULL; - lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches @@ -6020,9 +6017,6 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - if (!CGROUP_HAS_SUBSYS_CONFIG) - return -EINVAL; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); -- cgit v1.2.3 From aff037078ecaecf34a7c2afab1341815f90fba5e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 29 Jun 2023 17:56:12 -0700 Subject: sched/psi: use kernfs polling functions for PSI trigger polling Destroying psi trigger in cgroup_file_release causes UAF issues when a cgroup is removed from under a polling process. This is happening because cgroup removal causes a call to cgroup_file_release while the actual file is still alive. Destroying the trigger at this point would also destroy its waitqueue head and if there is still a polling process on that file accessing the waitqueue, it will step on the freed pointer: do_select vfs_poll do_rmdir cgroup_rmdir kernfs_drain_open_files cgroup_file_release cgroup_pressure_release psi_trigger_destroy wake_up_pollfree(&t->event_wait) // vfs_poll is unblocked synchronize_rcu kfree(t) poll_freewait -> UAF access to the trigger's waitqueue head Patch [1] fixed this issue for epoll() case using wake_up_pollfree(), however the same issue exists for synchronous poll() case. The root cause of this issue is that the lifecycles of the psi trigger's waitqueue and of the file associated with the trigger are different. Fix this by using kernfs_generic_poll function when polling on cgroup-specific psi triggers. It internally uses kernfs_open_node->poll waitqueue head with its lifecycle tied to the file's lifecycle. This also renders the fix in [1] obsolete, so revert it. [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()") Fixes: 0e94682b73bf ("psi: introduce psi monitor") Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/ Reported-by: Lu Jialin Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230630005612.1014540-1-surenb@google.com --- include/linux/psi.h | 5 +++-- include/linux/psi_types.h | 3 +++ kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 29 +++++++++++++++++++++-------- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup/cgroup.c') diff --git a/include/linux/psi.h b/include/linux/psi.h index ab26200c2803..e0745873e3f2 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -23,8 +23,9 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, enum psi_res res, struct file *file); +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 040c089581c6..f1fd3a8044e0 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -137,6 +137,9 @@ struct psi_trigger { /* Wait queue for polling */ wait_queue_head_t event_wait; + /* Kernfs file for cgroup triggers */ + struct kernfs_open_file *of; + /* Pending event flag */ int event; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bfe3cd8ccf36..f55a40db065f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_psi(cgrp); - new = psi_trigger_create(psi, buf, res, of->file); + new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 81fca77397f6..9bb3f2b3ccfc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, continue; /* Generate an event */ - if (cmpxchg(&t->event, 0, 1) == 0) - wake_up_interruptible(&t->event_wait); + if (cmpxchg(&t->event, 0, 1) == 0) { + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + } t->last_event_time = now; /* Reset threshold breach flag once event got generated */ t->pending_event = false; @@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, enum psi_res res, struct file *file) +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of) { struct psi_trigger *t; enum psi_states state; @@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; - init_waitqueue_head(&t->event_wait); + t->of = of; + if (!of) + init_waitqueue_head(&t->event_wait); t->pending_event = false; t->aggregator = privileged ? PSI_POLL : PSI_AVGS; @@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t) * being accessed later. Can happen if cgroup is deleted from under a * polling process. */ - wake_up_pollfree(&t->event_wait); + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); if (t->aggregator == PSI_AVGS) { mutex_lock(&group->avgs_lock); @@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - poll_wait(file, &t->event_wait, wait); + if (t->of) + kernfs_generic_poll(t->of, wait); + else + poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; @@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, res, file); + new = psi_trigger_create(&psi_system, buf, res, file, NULL); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); -- cgit v1.2.3