From ca174c705db52db3cc842e754fd25a5f50eb702d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 5 Mar 2026 14:53:29 -0500
Subject: cgroup/cpuset: Call rebuild_sched_domains() directly in hotplug

Besides deferring the call to housekeeping_update(), commit 6df415aa46ec
("cgroup/cpuset: Defer housekeeping_update() calls from CPU hotplug
to workqueue") also defers the rebuild_sched_domains() call to
the workqueue. So a new offline CPU may still be in a sched domain
or new online CPU not showing up in the sched domains for a short
transition period. That could be a problem in some corner cases and
can be the cause of a reported test failure[1]. Fix it by calling
rebuild_sched_domains_cpuslocked() directly in hotplug as before. If
isolated partition invalidation or recreation is being done, the
housekeeping_update() call to update the housekeeping cpumasks will
still be deferred to a workqueue.

In commit 3bfe47967191 ("cgroup/cpuset: Move
housekeeping_update()/rebuild_sched_domains() together"),
housekeeping_update() is called before rebuild_sched_domains() because
it needs to access the HK_TYPE_DOMAIN housekeeping cpumask. That is now
changed to use the static HK_TYPE_DOMAIN_BOOT cpumask as HK_TYPE_DOMAIN
cpumask is now changeable at run time.  As a result, we can move the
rebuild_sched_domains() call before housekeeping_update() with
the slight advantage that it will be done in the same cpus_read_lock
critical section without the possibility of interference by a concurrent
cpu hot add/remove operation.

As it doesn't make sense to acquire cpuset_mutex/cpuset_top_mutex after
calling housekeeping_update() and immediately release them again, move
the cpuset_full_unlock() operation inside update_hk_sched_domains()
and rename it to cpuset_update_sd_hk_unlock() to signify that it will
release the full set of locks.

[1] https://lore.kernel.org/lkml/1a89aceb-48db-4edd-a730-b445e41221fe@nvidia.com

Fixes: 6df415aa46ec ("cgroup/cpuset: Defer housekeeping_update() calls from CPU hotplug to workqueue")
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Reviewed-by: Chen Ridong <chenridong@huaweicloud.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 59 ++++++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e200de7c60b6..d21868455341 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -879,7 +879,7 @@ generate_doms:
 	/*
 	 * Cgroup v2 doesn't support domain attributes, just set all of them
 	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
-	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs.
 	 */
 	for (i = 0; i < ndoms; i++) {
 		/*
@@ -888,7 +888,7 @@ generate_doms:
 		 */
 		if (!csa || csa[i] == &top_cpuset)
 			cpumask_and(doms[i], top_cpuset.effective_cpus,
-				    housekeeping_cpumask(HK_TYPE_DOMAIN));
+				    housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
 		else
 			cpumask_copy(doms[i], csa[i]->effective_cpus);
 		if (dattr)
@@ -1329,17 +1329,22 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
 }
 
 /*
- * update_hk_sched_domains - Update HK cpumasks & rebuild sched domains
+ * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock
  *
- * Update housekeeping cpumasks and rebuild sched domains if necessary.
- * This should be called at the end of cpuset or hotplug actions.
+ * Update housekeeping cpumasks and rebuild sched domains if necessary and
+ * then do a cpuset_full_unlock().
+ * This should be called at the end of cpuset operation.
  */
-static void update_hk_sched_domains(void)
+static void cpuset_update_sd_hk_unlock(void)
+	__releases(&cpuset_mutex)
+	__releases(&cpuset_top_mutex)
 {
+	/* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
+	if (force_sd_rebuild)
+		rebuild_sched_domains_locked();
+
 	if (update_housekeeping) {
-		/* Updating HK cpumasks implies rebuild sched domains */
 		update_housekeeping = false;
-		force_sd_rebuild = true;
 		cpumask_copy(isolated_hk_cpus, isolated_cpus);
 
 		/*
@@ -1350,22 +1355,19 @@ static void update_hk_sched_domains(void)
 		mutex_unlock(&cpuset_mutex);
 		cpus_read_unlock();
 		WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
-		cpus_read_lock();
-		mutex_lock(&cpuset_mutex);
+		mutex_unlock(&cpuset_top_mutex);
+	} else {
+		cpuset_full_unlock();
 	}
-	/* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
-	if (force_sd_rebuild)
-		rebuild_sched_domains_locked();
 }
 
 /*
- * Work function to invoke update_hk_sched_domains()
+ * Work function to invoke cpuset_update_sd_hk_unlock()
  */
 static void hk_sd_workfn(struct work_struct *work)
 {
 	cpuset_full_lock();
-	update_hk_sched_domains();
-	cpuset_full_unlock();
+	cpuset_update_sd_hk_unlock();
 }
 
 /**
@@ -3230,8 +3232,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 
 	free_cpuset(trialcs);
 out_unlock:
-	update_hk_sched_domains();
-	cpuset_full_unlock();
+	cpuset_update_sd_hk_unlock();
 	if (of_cft(of)->private == FILE_MEMLIST)
 		schedule_flush_migrate_mm();
 	return retval ?: nbytes;
@@ -3338,8 +3339,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
 	cpuset_full_lock();
 	if (is_cpuset_online(cs))
 		retval = update_prstate(cs, val);
-	update_hk_sched_domains();
-	cpuset_full_unlock();
+	cpuset_update_sd_hk_unlock();
 	return retval ?: nbytes;
 }
 
@@ -3513,8 +3513,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
 	/* Reset valid partition back to member */
 	if (is_partition_valid(cs))
 		update_prstate(cs, PRS_MEMBER);
-	update_hk_sched_domains();
-	cpuset_full_unlock();
+	cpuset_update_sd_hk_unlock();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3923,11 +3922,13 @@ static void cpuset_handle_hotplug(void)
 		rcu_read_unlock();
 	}
 
-
 	/*
-	 * Queue a work to call housekeeping_update() & rebuild_sched_domains()
-	 * There will be a slight delay before the HK_TYPE_DOMAIN housekeeping
-	 * cpumask can correctly reflect what is in isolated_cpus.
+	 * rebuild_sched_domains() will always be called directly if needed
+	 * to make sure that newly added or removed CPU will be reflected in
+	 * the sched domains. However, if isolated partition invalidation
+	 * or recreation is being done (update_housekeeping set), a work item
+	 * will be queued to call housekeeping_update() to update the
+	 * corresponding housekeeping cpumasks after some slight delay.
 	 *
 	 * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
 	 * is still pending. Before the pending bit is cleared, the work data
@@ -3936,8 +3937,10 @@ static void cpuset_handle_hotplug(void)
 	 * previously queued work. Since hk_sd_workfn() doesn't use the work
 	 * item at all, this is not a problem.
 	 */
-	if (update_housekeeping || force_sd_rebuild)
-		queue_work(system_unbound_wq, &hk_sd_work);
+	if (force_sd_rebuild)
+		rebuild_sched_domains_cpuslocked();
+	if (update_housekeeping)
+		queue_work(system_dfl_wq, &hk_sd_work);
 
 	free_tmpmasks(ptmp);
 }
-- 
cgit v1.2.3


From a72f73c4dd9b209c53cf8b03b6e97fcefad4262c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 6 Mar 2026 20:22:35 +0100
Subject: cgroup: Don't expose dead tasks in cgroup

Once a task exits it has its state set to TASK_DEAD and then it is
removed from the cgroup it belonged to. The last step happens on the task
gets out of its last schedule() invocation and is delayed on PREEMPT_RT
due to locking constraints.

As a result it is possible to receive a pid via waitpid() of a task
which is still listed in cgroup.procs for the cgroup it belonged
to. This is something that systemd does not expect and as a result it
waits for its exit until a time out occurs.
This can also be reproduced on !PREEMPT_RT kernel with a significant
delay in do_exit() after exit_notify().

Hide the task from the output which have PF_EXITING set which is done
before the parent is notified. Keeping zombies with live threads
shouldn't break anything (suggested by Tejun).

Reported-by: Bert Karwatzki <spasswolf@web.de>
Closes: https://lore.kernel.org/all/20260219164648.3014-1-spasswolf@web.de/
Tested-by: Bert Karwatzki <spasswolf@web.de>
Fixes: 9311e6c29b34 ("cgroup: Fix sleeping from invalid context warning on PREEMPT_RT")
Cc: stable@vger.kernel.org # v6.19+
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda317..01fc2a93f3ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5109,6 +5109,12 @@ repeat:
 		return;
 
 	task = list_entry(it->task_pos, struct task_struct, cg_list);
+	/*
+	 * Hide tasks that are exiting but not yet removed. Keep zombie
+	 * leaders with live threads visible.
+	 */
+	if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+		goto repeat;
 
 	if (it->flags & CSS_TASK_ITER_PROCS) {
 		/* if PROCS, skip over tasks which aren't group leaders */
-- 
cgit v1.2.3