cgroup: replace global percpu_rwsem with per threadgroup resem when writing to cgroup.procs

The static usage pattern of creating a cgroup, enabling controllers, and then seeding it with CLONE_INTO_CGROUP doesn't require write locking cgroup_threadgroup_rwsem and thus doesn't benefit from this patch. To avoid affecting other users, the per threadgroup rwsem is only used when the favordynmods is enabled. As computer hardware advances, modern systems are typically equipped with many CPU cores and large amounts of memory, enabling the deployment of numerous applications. On such systems, container creation and deletion become frequent operations, making cgroup process migration no longer a cold path. This leads to noticeable contention with common process operations such as fork, exec, and exit. To alleviate the contention between cgroup process migration and operations like process fork, this patch modifies lock to take the write lock on signal_struct->group_rwsem when writing pid to cgroup.procs/threads instead of holding a global write lock. Cgroup process migration has historically relied on signal_struct->group_rwsem to protect thread group integrity. In commit <1ed1328792ff> ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"), this was changed to a global cgroup_threadgroup_rwsem. The advantage of using a global lock was simplified handling of process group migrations. This patch retains the use of the global lock for protecting process group migration, while reducing contention by using per thread group lock during cgroup.procs/threads writes. The locking behavior is as follows: write cgroup.procs/threads | process fork,exec,exit | process group migration ------------------------------------------------------------------------------ cgroup_lock() | down_read(&g_rwsem) | cgroup_lock() down_write(&p_rwsem) | down_read(&p_rwsem) | down_write(&g_rwsem) critical section | critical section | critical section up_write(&p_rwsem) | up_read(&p_rwsem) | up_write(&g_rwsem) cgroup_unlock() | up_read(&g_rwsem) | cgroup_unlock() g_rwsem denotes cgroup_threadgroup_rwsem, p_rwsem denotes signal_struct->group_rwsem. This patch eliminates contention between cgroup migration and fork operations for threads that belong to different thread groups, thereby reducing the long-tail latency of cgroup migrations and lowering system load. With this patch, under heavy fork and exec interference, the long-tail latency of cgroup migration has been reduced from milliseconds to microseconds. Under heavy cgroup migration interference, the multi-CPU score of the spawn test case in UnixBench increased by 9%. tj: Update comment in cgroup_favor_dynmods() and switch WARN_ONCE() to pr_warn_once(). Signed-off-by: Yi Tao <escape@linux.alibaba.com> Signed-off-by: Tejun Heo <tj@kernel.org>
author: Yi Tao <escape@linux.alibaba.com> 2025-09-10 09:59:35 +0300
committer: Tejun Heo <tj@kernel.org> 2025-09-10 20:44:51 +0300
commit: 0568f89d4fb82d98001baeb870e92f43cd1f7317 (patch)
tree: 05c45b42c20443c9606d894de945b57d6040d102 /include/linux
parent: 477abc2ec889a9dd3eb4ae0adbf6408a569bf2b6 (diff)
download: linux-0568f89d4fb82d98001baeb870e92f43cd1f7317.tar.xz
2 files changed, 20 insertions, 1 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ff3c7d0e3e01..93318fce31f3 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -91,6 +91,12 @@ enum {
 	 * cgroup_threadgroup_rwsem. This makes hot path operations such as
 	 * forks and exits into the slow path and more expensive.
 	 *
+	 * Alleviate the contention between fork, exec, exit operations and
+	 * writing to cgroup.procs by taking a per threadgroup rwsem instead of
+	 * the global cgroup_threadgroup_rwsem. Fork and other operations
+	 * from threads in different thread groups no longer contend with
+	 * writing to cgroup.procs.
+	 *
 	 * The static usage pattern of creating a cgroup, enabling controllers,
 	 * and then seeding it with CLONE_INTO_CGROUP doesn't require write
 	 * locking cgroup_threadgroup_rwsem and thus doesn't benefit from
@@ -146,6 +152,9 @@ enum cgroup_attach_lock_mode {
 
 	/* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
 	CGRP_ATTACH_LOCK_NONE,
+
+	/* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
+	CGRP_ATTACH_LOCK_PER_THREADGROUP,
 };
 
 /*
@@ -846,6 +855,7 @@ struct cgroup_subsys {
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+extern bool cgroup_enable_per_threadgroup_rwsem;
 
 struct cgroup_of_peak {
 	unsigned long		value;
@@ -857,11 +867,14 @@ struct cgroup_of_peak {
  * @tsk: target task
  *
  * Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
+ * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when
+ * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
  */
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 {
 	percpu_down_read(&cgroup_threadgroup_rwsem);
+	if (cgroup_enable_per_threadgroup_rwsem)
+		down_read(&tsk->signal->cgroup_threadgroup_rwsem);
 }
 
 /**
@@ -872,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
  */
 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 {
+	if (cgroup_enable_per_threadgroup_rwsem)
+		up_read(&tsk->signal->cgroup_threadgroup_rwsem);
 	percpu_up_read(&cgroup_threadgroup_rwsem);
 }
 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 1ef1edbaaf79..7d6449982822 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -226,6 +226,10 @@ struct signal_struct {
 	struct tty_audit_buf *tty_audit_buf;
 #endif
 
+#ifdef CONFIG_CGROUPS
+	struct rw_semaphore cgroup_threadgroup_rwsem;
+#endif
+
 	/*
 	 * Thread is the potential origin of an oom condition; kill first on
 	 * oom
author	Yi Tao <escape@linux.alibaba.com>	2025-09-10 09:59:35 +0300
committer	Tejun Heo <tj@kernel.org>	2025-09-10 20:44:51 +0300
commit	0568f89d4fb82d98001baeb870e92f43cd1f7317 (patch)
tree	05c45b42c20443c9606d894de945b57d6040d102 /include/linux
parent	477abc2ec889a9dd3eb4ae0adbf6408a569bf2b6 (diff)
download	linux-0568f89d4fb82d98001baeb870e92f43cd1f7317.tar.xz