From fd55c0adb46a44c9a0630dc32509e4733c290103 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 12 Sep 2023 12:34:34 +0530 Subject: cgroup: Check for ret during cgroup1_base_files cft addition There is no check for possible failure while populating cgroup1_base_files cft in css_populate_dir(), like its cgroup v2 counter parts cgroup_{base,psi}_files. In case of failure, the cgroup might not be set up right. Add ret value check to return on failure. Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..d40d58b963c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1731,8 +1731,10 @@ static int css_populate_dir(struct cgroup_subsys_state *css) return ret; } } else { - cgroup_addrm_files(css, cgrp, - cgroup1_base_files, true); + ret = cgroup_addrm_files(css, cgrp, + cgroup1_base_files, true); + if (ret < 0) + return ret; } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { -- cgit v1.2.3 From d24f05987ce8bf61e62d86fedbe47523dc5c3393 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 12 Sep 2023 12:34:35 +0530 Subject: cgroup: Avoid extra dereference in css_populate_dir() Use css directly instead of dereferencing it from &cgroup->self, while adding the cgroup v2 cft base and psi files in css_populate_dir(). Both points to the same css, when css->ss is NULL, this avoids extra deferences and makes code consistent in usage across the function. Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d40d58b963c8..833ac6dd15d9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1719,13 +1719,13 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) { - ret = cgroup_addrm_files(&cgrp->self, cgrp, + ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { - ret = cgroup_addrm_files(&cgrp->self, cgrp, + ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); if (ret < 0) return ret; -- cgit v1.2.3 From 6fcdb0183bf024a70abccb0439321c25891c708d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Sep 2023 09:32:37 -0400 Subject: cgroup/cpuset: Fix load balance state in update_partition_sd_lb() Commit a86ce68078b2 ("cgroup/cpuset: Extract out CS_CPU_EXCLUSIVE & CS_SCHED_LOAD_BALANCE handling") adds a new helper function update_partition_sd_lb() to update the load balance state of the cpuset. However the new load balance is determined by just looking at whether the cpuset is a valid isolated partition root or not. That is not enough if the cpuset is not a valid partition root but its parent is in the isolated state (load balance off). Update the function to set the new state to be the same as its parent in this case like what has been done in commit c8c926200c55 ("cgroup/cpuset: Inherit parent's load balance state in v2"). 
Fixes: a86ce68078b2 ("cgroup/cpuset: Extract out CS_CPU_EXCLUSIVE & CS_SCHED_LOAD_BALANCE handling") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58ec88efa4f8..4749e0c86c62 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1304,13 +1304,23 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs) * * Changing load balance flag will automatically call * rebuild_sched_domains_locked(). + * This function is for cgroup v2 only. */ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) { int new_prs = cs->partition_root_state; - bool new_lb = (new_prs != PRS_ISOLATED); bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + bool new_lb; + /* + * If cs is not a valid partition root, the load balance state + * will follow its parent. + */ + if (new_prs > 0) { + new_lb = (new_prs != PRS_ISOLATED); + } else { + new_lb = is_sched_load_balance(parent_cs(cs)); + } if (new_lb != !!is_sched_load_balance(cs)) { rebuild_domains = true; if (new_lb) -- cgit v1.2.3 From 0c7f293efc87a06b51db9aa65256f8cb0a5a0a21 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Sep 2023 09:32:38 -0400 Subject: cgroup/cpuset: Add cpuset.cpus.exclusive.effective for v2 The creation of a cpuset partition means dedicating a set of exclusive CPUs to be used by a particular partition only. These exclusive CPUs will not be used by any cpusets outside of that partition. To enable more flexibility in creating partitions, we need a way to distribute exclusive CPUs that can be used in new partitions. Currently, we have a subparts_cpus cpumask in struct cpuset that tracks only the exclusive CPUs used by all the sub-partitions underneath a given cpuset. This patch reworks the way we do exclusive CPUs tracking. The subparts_cpus is now renamed to effective_xcpus which tracks the exclusive CPUs allocated to a partition root including those that are further distributed down to sub-partitions underneath it. IOW, it also includes the exclusive CPUs used by the current partition root. Note that effective_xcpus can contain offline CPUs and it will always be a subset of cpus_allowed. The renamed effective_xcpus is now exposed via a new read-only "cpuset.cpus.exclusive.effective" control file. The new effective_xcpus cpumask should be set to cpus_allowed when a cpuset becomes a partition root and be cleared if it is not a valid partition root. In the next patch, we will enable write to another new control file to enable further control of what can get into effective_xcpus. 
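A minimal usage sketch of the new read-only file (hypothetical cgroup name and CPU numbers, assuming cgroup v2 is mounted at /sys/fs/cgroup and the cpuset controller is enabled in the root's cgroup.subtree_control):

  # mkdir /sys/fs/cgroup/part1
  # echo 2-3 > /sys/fs/cgroup/part1/cpuset.cpus
  # echo root > /sys/fs/cgroup/part1/cpuset.cpus.partition
  # cat /sys/fs/cgroup/part1/cpuset.cpus.exclusive.effective
  2-3

Once part1 becomes a valid partition root, effective_xcpus defaults to its cpus_allowed and is reported through "cpuset.cpus.exclusive.effective"; it is cleared again if the partition becomes invalid.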
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 728 ++++++++++++++++++++++++++++--------------------- 1 file changed, 421 insertions(+), 307 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4749e0c86c62..b269c6b79e1a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -78,7 +78,7 @@ enum prs_errcode { }; static const char * const perr_strings[] = { - [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus", + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", [PERR_NOTPART] = "Parent is not a partition root", [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", @@ -121,14 +121,18 @@ struct cpuset { nodemask_t effective_mems; /* - * CPUs allocated to child sub-partitions (default hierarchy only) - * - CPUs granted by the parent = effective_cpus U subparts_cpus - * - effective_cpus and subparts_cpus are mutually exclusive. + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * - * effective_cpus contains only onlined CPUs, but subparts_cpus - * may have offlined ones. + * This exclusive CPUs must be a subset of cpus_allowed. A parent + * cgroup can only grant exclusive CPUs to one of its children. + * + * When the cgroup becomes a valid partition root, effective_xcpus + * defaults to cpus_allowed if not set. The effective_cpus of a valid + * partition root comes solely from its effective_xcpus and some of the + * effective_xcpus may be distributed to sub-partitions below & hence + * excluded from its effective_cpus. */ - cpumask_var_t subparts_cpus; + cpumask_var_t effective_xcpus; /* * This is old Memory Nodes tasks took on. @@ -156,8 +160,8 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of CPUs in subparts_cpus */ - int nr_subparts_cpus; + /* number of valid sub-partitions */ + int nr_subparts; /* partition root state */ int partition_root_state; @@ -185,6 +189,11 @@ struct cpuset { struct cgroup_file partition_file; }; +/* + * Exclusive CPUs distributed out to sub-partitions of top_cpuset + */ +static cpumask_var_t subpartitions_cpus; + /* * Partition root states: * @@ -312,7 +321,7 @@ static inline int is_partition_invalid(const struct cpuset *cs) */ static inline void make_partition_invalid(struct cpuset *cs) { - if (is_partition_valid(cs)) + if (cs->partition_root_state > 0) cs->partition_root_state = -cs->partition_root_state; } @@ -469,7 +478,7 @@ static inline bool partition_is_populated(struct cpuset *cs, if (cs->css.cgroup->nr_populated_csets) return true; - if (!excluded_child && !cs->nr_subparts_cpus) + if (!excluded_child && !cs->nr_subparts) return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); @@ -601,7 +610,7 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; - pmask3 = &cs->subparts_cpus; + pmask3 = &cs->effective_xcpus; } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; @@ -636,7 +645,7 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (cs) { free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->subparts_cpus); + free_cpumask_var(cs->effective_xcpus); } if (tmp) { free_cpumask_var(tmp->new_cpus); @@ -664,6 +673,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, 
cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); return trial; } @@ -677,6 +687,25 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +/* + * cpu_exclusive_check() - check if two cpusets are exclusive + * + * Return 0 if exclusive, -EINVAL if not + */ +static inline bool cpu_exclusive_check(struct cpuset *cs1, struct cpuset *cs2) +{ + struct cpumask *cpus1, *cpus2; + + cpus1 = cpumask_empty(cs1->effective_xcpus) + ? cs1->cpus_allowed : cs1->effective_xcpus; + cpus2 = cpumask_empty(cs2->effective_xcpus) + ? cs2->cpus_allowed : cs2->effective_xcpus; + + if (cpumask_intersects(cpus1, cpus2)) + return -EINVAL; + return 0; +} + /* * validate_change_legacy() - Validate conditions specific to legacy (v1) * behavior. @@ -776,9 +805,10 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) ret = -EINVAL; cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; + c != cur) { + if (cpu_exclusive_check(trial, c)) + goto out; + } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) @@ -908,7 +938,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts_cpus) { + if (root_load_balance && !top_cpuset.nr_subparts) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -1159,7 +1189,7 @@ static void rebuild_sched_domains_locked(void) * should be the same as the active CPUs, so checking only top_cpuset * is enough to detect racing CPU offlines. */ - if (!top_cpuset.nr_subparts_cpus && + if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; @@ -1168,7 +1198,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts_cpus) { + if (top_cpuset.nr_subparts) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1232,7 +1262,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) */ if (kthread_is_per_cpu(task)) continue; - cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); + cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { cpumask_and(new_cpus, possible_mask, cs->effective_cpus); } @@ -1247,32 +1277,22 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * - * If the parent has subpartition CPUs, include them in the list of - * allowable CPUs in computing the new effective_cpus mask. Since offlined - * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask - * to mask those out. + * The result is valid only if the given cpuset isn't a partition root. 
*/ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus && is_partition_valid(cs)) { - cpumask_or(new_cpus, parent->effective_cpus, - parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); - cpumask_and(new_cpus, new_cpus, cpu_active_mask); - } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); - } + cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } /* - * Commands for update_parent_subparts_cpumask + * Commands for update_parent_effective_cpumask */ -enum subparts_cmd { - partcmd_enable, /* Enable partition root */ - partcmd_disable, /* Disable partition root */ - partcmd_update, /* Update parent's subparts_cpus */ - partcmd_invalidate, /* Make partition invalid */ +enum partition_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's effective_cpus */ + partcmd_invalidate, /* Make partition invalid */ }; static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1333,8 +1353,23 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) rebuild_sched_domains_locked(); } +/* + * tasks_nocpu_error - Return true if tasks will have no effective_cpus + */ +static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, + struct cpumask *xcpus) +{ + /* + * A populated partition (cs or parent) can't have empty effective_cpus + */ + return (cpumask_subset(parent->effective_cpus, xcpus) && + partition_is_populated(parent, cs)) || + (!cpumask_intersects(xcpus, cpu_active_mask) && + partition_is_populated(cs, NULL)); +} + /** - * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset + * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update @@ -1342,21 +1377,20 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) * Return: 0 or a partition root state error code * * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The cpus_allowed mask of the given cpuset will - * be put into parent's subparts_cpus and taken away from parent's + * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus + * not set) mask of the given cpuset will be taken away from parent's * effective_cpus. The function will return 0 if all the CPUs listed in - * cpus_allowed can be granted or an error code will be returned. + * effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition - * root back to a non-partition root. Any CPUs in cpus_allowed that are in - * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 will always be returned. + * root back to a non-partition root. Any CPUs in effective_xcpus will be + * given back to parent's effective_cpus. 0 will always be returned. * * For partcmd_update, if the optional newmask is specified, the cpu list is - * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is + * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is * assumed to remain the same. The cpuset should either be a valid or invalid * partition root. 
The partition root state may change from valid to invalid - * or vice versa. An error code will only be returned if transitioning from + * or vice versa. An error code will be returned if transitioning from * invalid to valid violates the exclusivity rule. * * For partcmd_invalidate, the current partition will be made invalid. @@ -1371,18 +1405,47 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) * check for error and so partition_root_state and prs_error will be updated * directly. */ -static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, - struct cpumask *newmask, - struct tmpmasks *tmp) +static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) { struct cpuset *parent = parent_cs(cs); - int adding; /* Moving cpus from effective_cpus to subparts_cpus */ - int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int adding; /* Adding cpus to parent's effective_cpus */ + int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ + int subparts_delta = 0; + struct cpumask *xcpus; /* cs effective_xcpus */ + bool nocpu; lockdep_assert_held(&cpuset_mutex); + /* + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. + */ + adding = deleting = false; + old_prs = new_prs = cs->partition_root_state; + xcpus = !cpumask_empty(cs->effective_xcpus) + ? cs->effective_xcpus : cs->cpus_allowed; + + if (cmd == partcmd_invalidate) { + if (is_prs_invalid(old_prs)) + return 0; + + /* + * Make the current partition invalid. + */ + if (is_partition_valid(parent)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + if (old_prs > 0) { + new_prs = -old_prs; + subparts_delta--; + } + goto write_error; + } + /* * The parent must be a partition root. * The new cpumask, if present, or the current cpus_allowed must @@ -1395,124 +1458,124 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; - /* - * new_prs will only be changed for the partcmd_update and - * partcmd_invalidate commands. - */ - adding = deleting = false; - old_prs = new_prs = cs->partition_root_state; + nocpu = tasks_nocpu_error(parent, cs, xcpus); + if (cmd == partcmd_enable) { /* - * Enabling partition root is not allowed if cpus_allowed - * doesn't overlap parent's cpus_allowed. + * Enabling partition root is not allowed if its + * effective_xcpus is empty or doesn't overlap with + * parent's effective_xcpus. */ - if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed)) + if (cpumask_empty(xcpus) || + !cpumask_intersects(xcpus, parent->effective_xcpus)) return PERR_INVCPUS; /* * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ - if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) && - partition_is_populated(parent, cs)) + if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->addmask, cs->cpus_allowed); - adding = true; + cpumask_copy(tmp->delmask, xcpus); + deleting = true; + subparts_delta++; } else if (cmd == partcmd_disable) { /* - * Need to remove cpus from parent's subparts_cpus for valid - * partition root. + n* May need to add cpus to parent's effective_cpus for + * valid partition root. 
*/ - deleting = !is_prs_invalid(old_prs) && - cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); - } else if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) - return 0; - + adding = !is_prs_invalid(old_prs) && + cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); + if (adding) + subparts_delta--; + } else if (newmask) { /* - * Make the current partition invalid. It is assumed that - * invalidation is caused by violating cpu exclusivity rule. + * Empty cpumask is not allowed */ - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); - if (old_prs > 0) { - new_prs = -old_prs; - part_error = PERR_NOTEXCL; + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + goto write_error; } - } else if (newmask) { + /* * partcmd_update with newmask: * - * Compute add/delete mask to/from subparts_cpus + * Compute add/delete mask to/from effective_cpus * - * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->cpus_allowed - * & ~parent->subparts_cpus + * addmask = effective_xcpus & ~newmask & parent->effective_xcpus + * delmask = newmask & ~cs->effective_xcpus + * & parent->effective_xcpus */ - cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask); - deleting = cpumask_and(tmp->delmask, tmp->delmask, - parent->subparts_cpus); + cpumask_andnot(tmp->addmask, xcpus, newmask); + adding = cpumask_and(tmp->addmask, tmp->addmask, + parent->effective_xcpus); - cpumask_and(tmp->addmask, newmask, parent->cpus_allowed); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); - /* - * Empty cpumask is not allowed - */ - if (cpumask_empty(newmask)) { - part_error = PERR_CPUSEMPTY; + cpumask_andnot(tmp->delmask, newmask, xcpus); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->effective_xcpus); /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ - } else if (adding && - cpumask_subset(parent->effective_cpus, tmp->addmask) && - !cpumask_intersects(tmp->delmask, cpu_active_mask) && - partition_is_populated(parent, cs)) { + if (nocpu && (!adding || + !cpumask_intersects(tmp->addmask, cpu_active_mask))) { part_error = PERR_NOCPUS; - adding = false; - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); + deleting = false; + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); } } else { /* - * partcmd_update w/o newmask: + * partcmd_update w/o newmask * - * delmask = cpus_allowed & parent->subparts_cpus - * addmask = cpus_allowed & parent->cpus_allowed - * & ~parent->subparts_cpus + * delmask = effective_xcpus & parent->effective_cpus * - * This gets invoked either due to a hotplug event or from - * update_cpumasks_hier(). This can cause the state of a - * partition root to transition from valid to invalid or vice - * versa. So we still need to compute the addmask and delmask. - - * A partition error happens when: - * 1) Cpuset is valid partition, but parent does not distribute - * out any CPUs. - * 2) Parent has tasks and all its effective CPUs will have - * to be distributed out. + * This can be called from: + * 1) update_cpumasks_hier() + * 2) cpuset_hotplug_update_tasks() + * + * Check to see if it can be transitioned from valid to + * invalid partition or vice versa. + * + * A partition error happens when parent has tasks and all + * its effective CPUs will have to be distributed out. 
*/ - cpumask_and(tmp->addmask, cs->cpus_allowed, - parent->cpus_allowed); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); - - if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) || - (adding && - cpumask_subset(parent->effective_cpus, tmp->addmask) && - partition_is_populated(parent, cs))) { + WARN_ON_ONCE(!is_partition_valid(parent)); + if (nocpu) { part_error = PERR_NOCPUS; - adding = false; - } + if (is_partition_valid(cs)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + } else if (is_partition_invalid(cs) && + cpumask_subset(xcpus, parent->effective_xcpus)) { + struct cgroup_subsys_state *css; + struct cpuset *child; + bool exclusive = true; - if (part_error && is_partition_valid(cs) && - parent->nr_subparts_cpus) - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); + /* + * Convert invalid partition to valid has to + * pass the cpu exclusivity test. + */ + rcu_read_lock(); + cpuset_for_each_child(child, css, parent) { + if (child == cs) + continue; + if (cpu_exclusive_check(cs, child)) { + exclusive = false; + break; + } + } + rcu_read_unlock(); + if (exclusive) + deleting = cpumask_and(tmp->delmask, + xcpus, parent->effective_cpus); + else + part_error = PERR_NOTEXCL; + } } + +write_error: if (part_error) WRITE_ONCE(cs->prs_err, part_error); @@ -1524,13 +1587,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) + if (part_error) { new_prs = -old_prs; + subparts_delta--; + } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) + if (!part_error) { new_prs = -old_prs; + subparts_delta++; + } break; } } @@ -1550,32 +1617,43 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, } /* - * Change the parent's subparts_cpus. + * Change the parent's effective_cpus & effective_xcpus (top cpuset + * only). + * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); if (adding) { - cpumask_or(parent->subparts_cpus, - parent->subparts_cpus, tmp->addmask); - cpumask_andnot(parent->effective_cpus, - parent->effective_cpus, tmp->addmask); - } - if (deleting) { - cpumask_andnot(parent->subparts_cpus, - parent->subparts_cpus, tmp->delmask); + if (parent == &top_cpuset) + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->addmask); /* - * Some of the CPUs in subparts_cpus might have been offlined. + * Some of the CPUs in effective_xcpus might have been offlined. 
*/ - cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); cpumask_or(parent->effective_cpus, - parent->effective_cpus, tmp->delmask); + parent->effective_cpus, tmp->addmask); + cpumask_and(parent->effective_cpus, + parent->effective_cpus, cpu_active_mask); + } + if (deleting) { + if (parent == &top_cpuset) + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->delmask); + cpumask_andnot(parent->effective_cpus, + parent->effective_cpus, tmp->delmask); } - parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + if (is_partition_valid(parent)) { + parent->nr_subparts += subparts_delta; + WARN_ON_ONCE(parent->nr_subparts < 0); + } - if (old_prs != new_prs) + if (old_prs != new_prs) { cs->partition_root_state = new_prs; + if (new_prs <= 0) + cs->nr_subparts = 0; + } spin_unlock_irq(&callback_lock); @@ -1600,6 +1678,71 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, return 0; } +/** + * compute_partition_effective_cpumask - compute effective_cpus for partition + * @cs: partition root cpuset + * @new_ecpus: previously computed effective_cpus to be updated + * + * Compute the effective_cpus of a partition root by scanning effective_xcpus + * of child partition roots and exclusing their effective_xcpus. + * + * This has the side effect of invalidating valid child partition roots, + * if necessary. Since it is called from either cpuset_hotplug_update_tasks() + * or update_cpumasks_hier() where parent and children are modified + * successively, we don't need to call update_parent_effective_cpumask() + * and the child's effective_cpus will be updated in later iterations. + * + * Note that rcu_read_lock() is assumed to be held. + */ +static void compute_partition_effective_cpumask(struct cpuset *cs, + struct cpumask *new_ecpus) +{ + struct cgroup_subsys_state *css; + struct cpuset *child; + bool populated = partition_is_populated(cs, NULL); + + /* + * Check child partition roots to see if they should be + * invalidated when + * 1) child effective_xcpus not a subset of new + * excluisve_cpus + * 2) All the effective_cpus will be used up and cp + * has tasks + */ + cpumask_and(new_ecpus, cs->effective_xcpus, cpu_active_mask); + rcu_read_lock(); + cpuset_for_each_child(child, css, cs) { + if (!is_partition_valid(child)) + continue; + + child->prs_err = 0; + if (!cpumask_subset(child->effective_xcpus, + cs->effective_xcpus)) + child->prs_err = PERR_INVCPUS; + else if (populated && + cpumask_subset(new_ecpus, child->effective_xcpus)) + child->prs_err = PERR_NOCPUS; + + if (child->prs_err) { + int old_prs = child->partition_root_state; + + /* + * Invalidate child partition + */ + spin_lock_irq(&callback_lock); + make_partition_invalid(child); + cs->nr_subparts--; + child->nr_subparts = 0; + spin_unlock_irq(&callback_lock); + notify_partition_change(child, old_prs); + continue; + } + cpumask_andnot(new_ecpus, new_ecpus, + child->effective_xcpus); + } + rcu_read_unlock(); +} + /* * update_cpumasks_hier() flags */ @@ -1634,6 +1777,19 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, compute_effective_cpumask(tmp->new_cpus, cp, parent); + if (is_partition_valid(parent) && is_partition_valid(cp)) + compute_partition_effective_cpumask(cp, tmp->new_cpus); + + /* + * A partition with no effective_cpus is allowed as long as + * there is no task associated with it. Call + * update_parent_effective_cpumask() to check it. 
+ */ + if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { + update_parent = true; + goto update_parent_effective; + } + /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs unless @@ -1641,10 +1797,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * out all its CPUs. */ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { - if (is_partition_valid(cp) && - cpumask_equal(cp->cpus_allowed, cp->subparts_cpus)) - goto update_parent_subparts; - cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; @@ -1671,12 +1823,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, continue; } -update_parent_subparts: +update_parent_effective: /* - * update_parent_subparts_cpumask() should have been called + * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call * update_tasks_cpumask() again for tasks in the parent - * cpuset if the parent's subparts_cpus changes. + * cpuset if the parent's effective_cpus changes. */ old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { @@ -1706,8 +1858,7 @@ update_parent_subparts: rcu_read_unlock(); if (update_parent) { - update_parent_subparts_cpumask(cp, partcmd_update, NULL, - tmp); + update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* * The cpuset partition_root_state may become * invalid. Capture it. @@ -1716,30 +1867,18 @@ update_parent_subparts: } spin_lock_irq(&callback_lock); - - if (cp->nr_subparts_cpus && !is_partition_valid(cp)) { - /* - * Put all active subparts_cpus back to effective_cpus. - */ - cpumask_or(tmp->new_cpus, tmp->new_cpus, - cp->subparts_cpus); - cpumask_and(tmp->new_cpus, tmp->new_cpus, - cpu_active_mask); - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus) { - /* - * Make sure that effective_cpus & subparts_cpus - * are mutually exclusive. - */ - cpumask_andnot(cp->effective_cpus, cp->effective_cpus, - cp->subparts_cpus); - } - cp->partition_root_state = new_prs; + if ((new_prs > 0) && cpumask_empty(cp->effective_xcpus)) + cpumask_and(cp->effective_xcpus, + cp->cpus_allowed, parent->effective_xcpus); + if (new_prs < 0) { + /* Reset partition data */ + cp->nr_subparts = 0; + cpumask_clear(cp->effective_xcpus); + if (is_cpu_exclusive(cp)) + clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); + } spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); @@ -1836,6 +1975,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; + struct cpuset *parent = parent_cs(cs); bool invalidate = false; int old_prs = cs->partition_root_state; @@ -1851,6 +1991,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) @@ -1859,6 +2000,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!cpumask_subset(trialcs->cpus_allowed, top_cpuset.cpus_allowed)) return -EINVAL; + + /* + * When effective_xcpus is set, make sure it is a subset of + * cpus_allowed and parent's effective_xcpus. 
+ */ + cpumask_and(trialcs->effective_xcpus, + parent->effective_xcpus, trialcs->cpus_allowed); } /* Nothing to do if the cpus didn't change */ @@ -1868,11 +2016,21 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; + if (is_partition_valid(cs)) { + if (cpumask_empty(trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_INVCPUS; + } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_NOCPUS; + } + } + retval = validate_change(cs, trialcs); if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { - struct cpuset *cp, *parent; struct cgroup_subsys_state *css; + struct cpuset *cp; /* * The -EINVAL error code indicates that partition sibling @@ -1883,69 +2041,44 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ invalidate = true; rcu_read_lock(); - parent = parent_cs(cs); cpuset_for_each_child(cp, css, parent) if (is_partition_valid(cp) && - cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) { + cpumask_intersects(trialcs->effective_xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); rcu_read_lock(); } rcu_read_unlock(); retval = 0; } + if (retval < 0) goto out_free; if (cs->partition_root_state) { if (invalidate) - update_parent_subparts_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + update_parent_effective_cpumask(cs, partcmd_invalidate, + NULL, &tmp); else - update_parent_subparts_cpumask(cs, partcmd_update, - trialcs->cpus_allowed, &tmp); + update_parent_effective_cpumask(cs, partcmd_update, + trialcs->effective_xcpus, &tmp); } - compute_effective_cpumask(trialcs->effective_cpus, trialcs, - parent_cs(cs)); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + if (!is_partition_valid(cs)) + cpumask_clear(cs->effective_xcpus); + else + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); - /* - * Make sure that subparts_cpus, if not empty, is a subset of - * cpus_allowed. Clear subparts_cpus if partition not valid or - * empty effective cpus with tasks. - */ - if (cs->nr_subparts_cpus) { - if (!is_partition_valid(cs) || - (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) && - partition_is_populated(cs, NULL))) { - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - } else { - cpumask_and(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); - cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); - } - } spin_unlock_irq(&callback_lock); /* effective_cpus will be updated here */ update_cpumasks_hier(cs, &tmp, 0); - if (cs->partition_root_state) { - struct cpuset *parent = parent_cs(cs); - - /* - * For partition root, update the cpumasks of sibling - * cpusets if they use parent's effective_cpus. 
- */ - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); - - /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - } out_free: free_cpumasks(NULL, &tmp); return 0; @@ -2323,7 +2456,6 @@ out: static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; - struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; if (old_prs == new_prs) @@ -2341,6 +2473,19 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; + /* + * Setup effective_xcpus if not set yet, it will be cleared later + * if partition becomes invalid. + */ + if ((new_prs > 0) && cpumask_empty(cs->effective_xcpus)) { + struct cpuset *parent = parent_cs(cs); + + spin_lock_irq(&callback_lock); + cpumask_and(cs->effective_xcpus, + cs->cpus_allowed, parent->effective_xcpus); + spin_unlock_irq(&callback_lock); + } + err = update_partition_exclusive(cs, new_prs); if (err) goto out; @@ -2354,8 +2499,8 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; } - err = update_parent_subparts_cpumask(cs, partcmd_enable, - NULL, &tmpmask); + err = update_parent_effective_cpumask(cs, partcmd_enable, + NULL, &tmpmask); } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. @@ -2366,19 +2511,13 @@ static int update_prstate(struct cpuset *cs, int new_prs) * Switching back to member is always allowed even if it * disables child partitions. */ - update_parent_subparts_cpumask(cs, partcmd_disable, NULL, - &tmpmask); + update_parent_effective_cpumask(cs, partcmd_disable, NULL, + &tmpmask); /* - * If there are child partitions, they will all become invalid. + * Invalidation of child partitions will be done in + * update_cpumasks_hier(). */ - if (unlikely(cs->nr_subparts_cpus)) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - compute_effective_cpumask(cs->effective_cpus, cs, parent); - spin_unlock_irq(&callback_lock); - } } out: /* @@ -2393,14 +2532,12 @@ out: spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); + if (!is_partition_valid(cs)) + cpumask_clear(cs->effective_xcpus); spin_unlock_irq(&callback_lock); - /* - * Update child cpusets, if present. - * Force update if switching back to member. - */ - if (!list_empty(&cs->css.children)) - update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + /* Force update if switching back to member */ + update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); @@ -2649,7 +2786,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) guarantee_online_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), - cs->subparts_cpus); + subpartitions_cpus); /* * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here @@ -2752,6 +2889,7 @@ typedef enum { FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, FILE_SUBPARTS_CPULIST, + FILE_EFFECTIVE_XCPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -2936,8 +3074,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_EFFECTIVE_XCPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); + break; case FILE_SUBPARTS_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); break; default: ret = -EINVAL; @@ -3209,11 +3350,18 @@ static struct cftype dfl_files[] = { .file_offset = offsetof(struct cpuset, partition_file), }, + { + .name = "cpus.exclusive.effective", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_XCPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, .private = FILE_SUBPARTS_CPULIST, - .flags = CFTYPE_DEBUG, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, }, { } /* terminate */ @@ -3387,6 +3535,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, @@ -3525,11 +3674,13 @@ int __init cpuset_init(void) { BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); + cpumask_setall(top_cpuset.effective_xcpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); @@ -3669,30 +3820,15 @@ retry: compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); - if (cs->nr_subparts_cpus) - /* - * Make sure that CPUs allocated to child partitions - * do not show up in effective_cpus. - */ - cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); - if (!tmp || !cs->partition_root_state) goto update_tasks; /* - * In the unlikely event that a partition root has empty - * effective_cpus with tasks, we will have to invalidate child - * partitions, if present, by setting nr_subparts_cpus to 0 to - * reclaim their cpus. + * Compute effective_cpus for valid partition root, may invalidate + * child partition roots if necessary. */ - if (cs->nr_subparts_cpus && is_partition_valid(cs) && - cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - spin_unlock_irq(&callback_lock); - compute_effective_cpumask(&new_cpus, cs, parent); - } + if (is_partition_valid(cs) && is_partition_valid(parent)) + compute_partition_effective_cpumask(cs, &new_cpus); /* * Force the partition to become invalid if either one of @@ -3701,44 +3837,22 @@ retry: * 2) parent is invalid or doesn't grant any cpus to child * partitions. 
*/ - if (is_partition_valid(cs) && (!parent->nr_subparts_cpus || - (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) { - int old_prs, parent_prs; - - update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); - if (cs->nr_subparts_cpus) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - spin_unlock_irq(&callback_lock); - compute_effective_cpumask(&new_cpus, cs, parent); - } - - old_prs = cs->partition_root_state; - parent_prs = parent->partition_root_state; - if (is_partition_valid(cs)) { - spin_lock_irq(&callback_lock); - make_partition_invalid(cs); - spin_unlock_irq(&callback_lock); - if (is_prs_invalid(parent_prs)) - WRITE_ONCE(cs->prs_err, PERR_INVPARENT); - else if (!parent_prs) - WRITE_ONCE(cs->prs_err, PERR_NOTPART); - else - WRITE_ONCE(cs->prs_err, PERR_HOTPLUG); - notify_partition_change(cs, old_prs); - } + if (is_partition_valid(cs) && (!is_partition_valid(parent) || + tasks_nocpu_error(parent, cs, &new_cpus))) { + update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); + compute_effective_cpumask(&new_cpus, cs, parent); cpuset_force_rebuild(); } - /* * On the other hand, an invalid partition root may be transitioned * back to a regular one. */ else if (is_partition_valid(parent) && is_partition_invalid(cs)) { - update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp); - if (is_partition_valid(cs)) + update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp); + if (is_partition_valid(cs)) { + compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); + } } update_tasks: @@ -3796,21 +3910,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work) new_mems = node_states[N_MEMORY]; /* - * If subparts_cpus is populated, it is likely that the check below - * will produce a false positive on cpus_updated when the cpu list - * isn't changed. It is extra work, but it is better to be safe. + * If subpartitions_cpus is populated, it is likely that the check + * below will produce a false positive on cpus_updated when the cpu + * list isn't changed. It is extra work, but it is better to be safe. */ - cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || + !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* - * In the rare case that hotplug removes all the cpus in subparts_cpus, - * we assumed that cpus are updated. + * In the rare case that hotplug removes all the cpus in + * subpartitions_cpus, we assumed that cpus are updated. */ - if (!cpus_updated && top_cpuset.nr_subparts_cpus) + if (!cpus_updated && top_cpuset.nr_subparts) cpus_updated = true; - /* synchronize cpus_allowed to cpu_active_mask */ + /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) @@ -3818,17 +3933,16 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, - * we clear the subparts_cpus & let the child partitions + * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. 
*/ - if (top_cpuset.nr_subparts_cpus) { - if (cpumask_subset(&new_cpus, - top_cpuset.subparts_cpus)) { - top_cpuset.nr_subparts_cpus = 0; - cpumask_clear(top_cpuset.subparts_cpus); + if (!cpumask_empty(subpartitions_cpus)) { + if (cpumask_subset(&new_cpus, subpartitions_cpus)) { + top_cpuset.nr_subparts = 0; + cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, - top_cpuset.subparts_cpus); + subpartitions_cpus); } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); @@ -3960,7 +4074,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * We first exclude cpus allocated to partitions. If there is no * allowable online cpu left, we fall back to all possible cpus. */ - cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus); + cpumask_andnot(pmask, possible_mask, subpartitions_cpus); if (!cpumask_intersects(pmask, cpu_online_mask)) cpumask_copy(pmask, possible_mask); } -- cgit v1.2.3 From e2ffe502ba4505ee9c7b432980c702b7801a37f3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Sep 2023 09:32:39 -0400 Subject: cgroup/cpuset: Add cpuset.cpus.exclusive for v2 This patch introduces a new writable "cpuset.cpus.exclusive" control file for v2 which will be added to non-root cpuset enabled cgroups. This new file enables user to set a smaller list of exclusive CPUs to be used in the creation of a cpuset partition. The value written to "cpuset.cpus.exclusive" may not be the effective value being used for the creation of cpuset partition, the effective value will show up in "cpuset.cpus.exclusive.effective" and it is subject to the constraint that it must also be a subset of cpus_allowed and parent's "cpuset.cpus.exclusive.effective". By writing to "cpuset.cpus.exclusive", "cpuset.cpus.exclusive.effective" may be set to a non-empty value even for cgroups that are not valid partition roots yet. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 273 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 239 insertions(+), 34 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b269c6b79e1a..0419654f3004 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -134,6 +134,11 @@ struct cpuset { */ cpumask_var_t effective_xcpus; + /* + * Exclusive CPUs as requested by the user (default hierarchy only) + */ + cpumask_var_t exclusive_cpus; + /* * This is old Memory Nodes tasks took on. 
* @@ -605,16 +610,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) */ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { - cpumask_var_t *pmask1, *pmask2, *pmask3; + cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; pmask3 = &cs->effective_xcpus; + pmask4 = &cs->exclusive_cpus; } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; + pmask4 = NULL; } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -626,8 +633,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; + if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; + + return 0; +free_three: + free_cpumask_var(*pmask3); free_two: free_cpumask_var(*pmask2); free_one: @@ -646,6 +659,7 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); } if (tmp) { free_cpumask_var(tmp->new_cpus); @@ -674,6 +688,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); return trial; } @@ -687,6 +702,13 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +static inline struct cpumask *fetch_xcpus(struct cpuset *cs) +{ + return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : + cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed + : cs->effective_xcpus; +} + /* * cpu_exclusive_check() - check if two cpusets are exclusive * @@ -694,14 +716,10 @@ static inline void free_cpuset(struct cpuset *cs) */ static inline bool cpu_exclusive_check(struct cpuset *cs1, struct cpuset *cs2) { - struct cpumask *cpus1, *cpus2; + struct cpumask *xcpus1 = fetch_xcpus(cs1); + struct cpumask *xcpus2 = fetch_xcpus(cs2); - cpus1 = cpumask_empty(cs1->effective_xcpus) - ? cs1->cpus_allowed : cs1->effective_xcpus; - cpus2 = cpumask_empty(cs2->effective_xcpus) - ? cs2->cpus_allowed : cs2->effective_xcpus; - - if (cpumask_intersects(cpus1, cpus2)) + if (cpumask_intersects(xcpus1, xcpus2)) return -EINVAL; return 0; } @@ -1368,6 +1386,54 @@ static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, partition_is_populated(cs, NULL)); } +static void reset_partition_data(struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(cs); + + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + lockdep_assert_held(&callback_lock); + + cs->nr_subparts = 0; + if (cpumask_empty(cs->exclusive_cpus)) { + cpumask_clear(cs->effective_xcpus); + if (is_cpu_exclusive(cs)) + clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); + } + if (!cpumask_and(cs->effective_cpus, + parent->effective_cpus, cs->cpus_allowed)) { + cs->use_parent_ecpus = true; + parent->child_ecpus_count++; + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + } +} + +/* + * compute_effective_exclusive_cpumask - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: true if xcpus is not empty, false otherwise. + * + * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), + * it must be a subset of cpus_allowed and parent's effective_xcpus. 
+ */ +static bool compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus) +{ + struct cpuset *parent = parent_cs(cs); + + if (!xcpus) + xcpus = cs->effective_xcpus; + + if (!cpumask_empty(cs->exclusive_cpus)) + cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); + else + cpumask_copy(xcpus, cs->cpus_allowed); + + return cpumask_and(xcpus, xcpus, parent->effective_xcpus); +} + /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1426,7 +1492,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = !cpumask_empty(cs->effective_xcpus) + xcpus = !cpumask_empty(cs->exclusive_cpus) ? cs->effective_xcpus : cs->cpus_allowed; if (cmd == partcmd_invalidate) { @@ -1659,8 +1725,7 @@ write_error: if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, tmp); + update_sibling_cpumasks(parent, cs, tmp); } /* @@ -1709,7 +1774,9 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - cpumask_and(new_ecpus, cs->effective_xcpus, cpu_active_mask); + compute_effective_exclusive_cpumask(cs, new_ecpus); + cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); + rcu_read_lock(); cpuset_for_each_child(child, css, cs) { if (!is_partition_valid(child)) @@ -1777,6 +1844,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, compute_effective_cpumask(tmp->new_cpus, cp, parent); + /* + * Update effective_xcpus if exclusive_cpus set. + * The case when exclusive_cpus isn't set is handled later. + */ + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { + spin_lock_irq(&callback_lock); + compute_effective_exclusive_cpumask(cp, NULL); + spin_unlock_irq(&callback_lock); + } + if (is_partition_valid(parent) && is_partition_valid(cp)) compute_partition_effective_cpumask(cp, tmp->new_cpus); @@ -1869,7 +1946,11 @@ update_parent_effective: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; - if ((new_prs > 0) && cpumask_empty(cp->effective_xcpus)) + /* + * Make sure effective_xcpus is properly set for a valid + * partition root. + */ + if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) cpumask_and(cp->effective_xcpus, cp->cpus_allowed, parent->effective_xcpus); if (new_prs < 0) { @@ -1886,7 +1967,7 @@ update_parent_effective: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, tmp->new_cpus); + update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -1939,8 +2020,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, /* * Check all its siblings and call update_cpumasks_hier() - * if their use_parent_ecpus flag is set in order for them - * to use the right effective_cpus value. + * if their effective_cpus will need to be changed. + * + * With the addition of effective_xcpus which is a subset of + * cpus_allowed. It is possible a change in parent's effective_cpus + * due to a change in a child partition's effective_xcpus will impact + * its siblings even if they do not inherit parent's effective_cpus + * directly. * * The update_cpumasks_hier() function may sleep. 
So we have to * release the RCU read lock before calling it. HIER_NO_SD_REBUILD @@ -1951,8 +2037,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus) - continue; + if (!sibling->use_parent_ecpus && + !is_partition_valid(sibling)) { + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) + continue; + } if (!css_tryget_online(&sibling->css)) continue; @@ -1977,6 +2068,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, struct tmpmasks tmp; struct cpuset *parent = parent_cs(cs); bool invalidate = false; + int hier_flags = 0; int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -2002,11 +2094,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EINVAL; /* - * When effective_xcpus is set, make sure it is a subset of - * cpus_allowed and parent's effective_xcpus. + * When exclusive_cpus isn't explicitly set, it is constrainted + * by cpus_allowed and parent's effective_xcpus. Otherwise, + * trialcs->effective_xcpus is used as a temporary cpumask + * for checking validity of the partition root. */ - cpumask_and(trialcs->effective_xcpus, - parent->effective_xcpus, trialcs->cpus_allowed); + if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) + compute_effective_exclusive_cpumask(trialcs, NULL); } /* Nothing to do if the cpus didn't change */ @@ -2026,6 +2120,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } } + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) + hier_flags = HIER_CHECKALL; + retval = validate_change(cs, trialcs); if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { @@ -2055,7 +2156,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto out_free; - if (cs->partition_root_state) { + if (is_partition_valid(cs)) { if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); @@ -2066,15 +2167,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - if (!is_partition_valid(cs)) - cpumask_clear(cs->effective_xcpus); - else - cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); - + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); spin_unlock_irq(&callback_lock); - /* effective_cpus will be updated here */ - update_cpumasks_hier(cs, &tmp, 0); + /* effective_cpus/effective_xcpus will be updated here */ + update_cpumasks_hier(cs, &tmp, hier_flags); /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) @@ -2084,6 +2183,94 @@ out_free: return 0; } +/** + * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + * + * The tasks' cpumask will be updated if cs is a valid partition root. 
+ */ +static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + struct cpuset *parent = parent_cs(cs); + bool invalidate = false; + int hier_flags = 0; + int old_prs = cs->partition_root_state; + + if (!*buf) { + cpumask_clear(trialcs->exclusive_cpus); + } else { + retval = cpulist_parse(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; + if (!is_cpu_exclusive(cs)) + set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); + } + + /* Nothing to do if the CPUs didn't change */ + if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) + return 0; + + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; + + compute_effective_exclusive_cpumask(trialcs, NULL); + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) + hier_flags = HIER_CHECKALL; + + retval = validate_change(cs, trialcs); + if (retval) + return retval; + + if (is_partition_valid(cs)) { + if (cpumask_empty(trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_INVCPUS; + } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_NOCPUS; + } + + if (invalidate) + update_parent_effective_cpumask(cs, partcmd_invalidate, + NULL, &tmp); + else + update_parent_effective_cpumask(cs, partcmd_update, + trialcs->effective_xcpus, &tmp); + } + + spin_lock_irq(&callback_lock); + cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* + * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus + * of the subtree when it is a valid partition root or effective_xcpus + * is updated. + */ + if (is_partition_valid(cs) || hier_flags) + update_cpumasks_hier(cs, &tmp, hier_flags); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) + update_partition_sd_lb(cs, old_prs); + + free_cpumasks(NULL, &tmp); + return 0; +} + /* * Migrate memory region from one set of nodes to another. This is * performed asynchronously as it can be called from process migration path @@ -2474,10 +2661,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) return -ENOMEM; /* - * Setup effective_xcpus if not set yet, it will be cleared later - * if partition becomes invalid. + * Setup effective_xcpus if not properly set yet, it will be cleared + * later if partition becomes invalid. 
*/ - if ((new_prs > 0) && cpumask_empty(cs->effective_xcpus)) { + if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { struct cpuset *parent = parent_cs(cs); spin_lock_irq(&callback_lock); @@ -2533,7 +2720,7 @@ out: cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) - cpumask_clear(cs->effective_xcpus); + reset_partition_data(cs); spin_unlock_irq(&callback_lock); /* Force update if switching back to member */ @@ -2889,6 +3076,7 @@ typedef enum { FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, FILE_SUBPARTS_CPULIST, + FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, @@ -3027,6 +3215,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; + case FILE_EXCLUSIVE_CPULIST: + retval = update_exclusive_cpumask(cs, trialcs, buf); + break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; @@ -3074,6 +3265,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_EXCLUSIVE_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); + break; case FILE_EFFECTIVE_XCPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); break; @@ -3350,6 +3544,15 @@ static struct cftype dfl_files[] = { .file_offset = offsetof(struct cpuset, partition_file), }, + { + .name = "cpus.exclusive", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_EXCLUSIVE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { .name = "cpus.exclusive.effective", .seq_show = cpuset_common_seq_show, @@ -3675,12 +3878,14 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); cpumask_setall(top_cpuset.effective_xcpus); + cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); -- cgit v1.2.3 From 181c8e091aae11b0b7efba49b34adfe3c89ce648 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Sep 2023 09:32:40 -0400 Subject: cgroup/cpuset: Introduce remote partition One can use "cpuset.cpus.partition" to create multiple scheduling domains or to produce a set of isolated CPUs where load balancing is disabled. The former use case is less common but the latter one can be frequently used especially for the Telco use cases like DPDK. The existing "isolated" partition can be used to produce isolated CPUs if the applications have full control of a system. However, in a containerized environment where all the apps are run in a container, it is hard to distribute out isolated CPUs from the root down given the unified hierarchy nature of cgroup v2. The container running on isolated CPUs can be several layers down from the root. The current partition feature requires that all the ancestors of a leaf partition root must be parititon roots themselves. This can be hard to configure. This patch introduces a new type of partition called remote partition. 
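A minimal usage sketch of the scheme described in this message, with hypothetical cgroup names "cont" and "app" and hypothetical CPUs 2-3; it assumes a cgroup v2 mount at /sys/fs/cgroup, the cpuset controller enabled along the path, and a root shell:

    # Hand the prospective exclusive CPUs down through a parent that is
    # itself not a partition root.
    echo 2-3 > /sys/fs/cgroup/cont/cpuset.cpus
    echo 2-3 > /sys/fs/cgroup/cont/cpuset.cpus.exclusive
    echo 2-3 > /sys/fs/cgroup/cont/app/cpuset.cpus
    echo 2-3 > /sys/fs/cgroup/cont/app/cpuset.cpus.exclusive
    # Turn the leaf into a remote partition; its CPUs are taken straight
    # from the top cpuset.
    echo isolated > /sys/fs/cgroup/cont/app/cpuset.cpus.partition

Reading "cpuset.cpus.exclusive.effective" at each level then shows which of the requested exclusive CPUs were actually granted.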
A remote partition is a partition whose parent is not a partition root itself and its CPUs are acquired directly from available CPUs in the top cpuset through a hierachical distribution of exclusive CPUs down from it. By contrast, the existing type of partitions where their parents have to be valid partition roots are referred to as local partitions as they have to be clustered around a parent partition root. Child local partitons can be created under a remote partition, but a remote partition cannot be created under a local partition. We may relax this limitation in the future if there are use cases for such configuration. Manually writing to the "cpuset.cpus.exclusive" file is not necessary when creating local partitions. However, writing proper values to "cpuset.cpus.exclusive" down the cgroup hierarchy before the target remote partition root is mandatory for the creation of a remote partition. The value in "cpuset.cpus.exclusive.effective" may change if its "cpuset.cpus" or its parent's "cpuset.cpus.exclusive.effective" changes. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 335 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 306 insertions(+), 29 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0419654f3004..7ac320e079b8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -192,6 +192,9 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; }; /* @@ -199,6 +202,9 @@ struct cpuset { */ static cpumask_var_t subpartitions_cpus; +/* List of remote partition root children */ +static struct list_head remote_children; + /* * Partition root states: * @@ -348,6 +354,7 @@ static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .partition_root_state = PRS_ROOT, + .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; /** @@ -1434,6 +1441,211 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, return cpumask_and(xcpus, xcpus, parent->effective_xcpus); } +static inline bool is_remote_partition(struct cpuset *cs) +{ + return !list_empty(&cs->remote_sibling); +} + +static inline bool is_local_partition(struct cpuset *cs) +{ + return is_partition_valid(cs) && !is_remote_partition(cs); +} + +/* + * remote_partition_enable - Enable current cpuset as a remote partition root + * @cs: the cpuset to update + * @tmp: temparary masks + * Return: 1 if successful, 0 if error + * + * Enable the current cpuset to become a remote partition root taking CPUs + * directly from the top cpuset. cpuset_mutex must be held by the caller. + */ +static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) +{ + /* + * The user must have sysadmin privilege. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* + * The requested exclusive_cpus must not be allocated to other + * partitions and it can't use up all the root's effective_cpus. + * + * Note that if there is any local partition root above it or + * remote partition root underneath it, its exclusive_cpus must + * have overlapped with subpartitions_cpus. 
+ */ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + if (cpumask_empty(tmp->new_cpus) || + cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) + return 0; + + spin_lock_irq(&callback_lock); + cpumask_andnot(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->new_cpus); + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->new_cpus); + + if (cs->use_parent_ecpus) { + struct cpuset *parent = parent_cs(cs); + + cs->use_parent_ecpus = false; + parent->child_ecpus_count--; + } + list_add(&cs->remote_sibling, &remote_children); + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + + return 1; +} + +/* + * remote_partition_disable - Remove current cpuset from remote partition list + * @cs: the cpuset to update + * @tmp: temparary masks + * + * The effective_cpus is also updated. + * + * cpuset_mutex must be held by the caller. + */ +static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) +{ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + WARN_ON_ONCE(!is_remote_partition(cs)); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + + spin_lock_irq(&callback_lock); + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->new_cpus); + cpumask_and(tmp->new_cpus, + tmp->new_cpus, cpu_active_mask); + cpumask_or(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->new_cpus); + list_del_init(&cs->remote_sibling); + cs->partition_root_state = -cs->partition_root_state; + if (!cs->prs_err) + cs->prs_err = PERR_INVCPUS; + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); +} + +/* + * remote_cpus_update - cpus_exclusive change of remote partition + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @tmp: temparary masks + * + * top_cpuset and subpartitions_cpus will be updated or partition can be + * invalidated. + */ +static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, + struct tmpmasks *tmp) +{ + bool adding, deleting; + + if (WARN_ON_ONCE(!is_remote_partition(cs))) + return; + + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + if (cpumask_empty(newmask)) + goto invalidate; + + adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + + /* + * Additions of remote CPUs is only allowed if those CPUs are + * not allocated to other partitions and there are effective_cpus + * left in the top cpuset. 
+ */ + if (adding && (!capable(CAP_SYS_ADMIN) || + cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) + goto invalidate; + + spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->addmask); + cpumask_andnot(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->addmask); + } + if (deleting) { + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->delmask); + cpumask_and(tmp->delmask, + tmp->delmask, cpu_active_mask); + cpumask_or(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->delmask); + } + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return; + +invalidate: + remote_partition_disable(cs, tmp); +} + +/* + * remote_partition_check - check if a child remote partition needs update + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @delmask: temporary mask for deletion (not in tmp) + * @tmp: temparary masks + * + * This should be called before the given cs has updated its cpus_allowed + * and/or effective_xcpus. + */ +static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, + struct cpumask *delmask, struct tmpmasks *tmp) +{ + struct cpuset *child, *next; + int disable_cnt = 0; + + /* + * Compute the effective exclusive CPUs that will be deleted. + */ + if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || + !cpumask_intersects(delmask, subpartitions_cpus)) + return; /* No deletion of exclusive CPUs in partitions */ + + /* + * Searching the remote children list to look for those that will + * be impacted by the deletion of exclusive CPUs. + * + * Since a cpuset must be removed from the remote children list + * before it can go offline and holding cpuset_mutex will prevent + * any change in cpuset status. RCU read lock isn't needed. + */ + lockdep_assert_held(&cpuset_mutex); + list_for_each_entry_safe(child, next, &remote_children, remote_sibling) + if (cpumask_intersects(child->effective_cpus, delmask)) { + remote_partition_disable(child, tmp); + disable_cnt++; + } + if (disable_cnt) + rebuild_sched_domains_locked(); +} + /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1548,7 +1760,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, subparts_delta++; } else if (cmd == partcmd_disable) { /* - n* May need to add cpus to parent's effective_cpus for + * May need to add cpus to parent's effective_cpus for * valid partition root. */ adding = !is_prs_invalid(old_prs) && @@ -1749,7 +1961,7 @@ write_error: * @new_ecpus: previously computed effective_cpus to be updated * * Compute the effective_cpus of a partition root by scanning effective_xcpus - * of child partition roots and exclusing their effective_xcpus. + * of child partition roots and excluding their effective_xcpus. * * This has the side effect of invalidating valid child partition roots, * if necessary. 
Since it is called from either cpuset_hotplug_update_tasks() @@ -1840,9 +2052,17 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool remote = is_remote_partition(cp); bool update_parent = false; - compute_effective_cpumask(tmp->new_cpus, cp, parent); + /* + * Skip descendent remote partition that acquires CPUs + * directly from top cpuset unless it is cs. + */ + if (remote && (cp != cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } /* * Update effective_xcpus if exclusive_cpus set. @@ -1854,8 +2074,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, spin_unlock_irq(&callback_lock); } - if (is_partition_valid(parent) && is_partition_valid(cp)) + old_prs = new_prs = cp->partition_root_state; + if (remote || (is_partition_valid(parent) && + is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); + else + compute_effective_cpumask(tmp->new_cpus, cp, parent); /* * A partition with no effective_cpus is allowed as long as @@ -1873,7 +2097,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; @@ -1885,6 +2109,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, parent->child_ecpus_count--; } + if (remote) + goto get_css; + /* * Skip the whole subtree if * 1) the cpumask remains the same, @@ -1907,7 +2134,6 @@ update_parent_effective: * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. 
*/ - old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_ROOT: @@ -1929,7 +2155,7 @@ update_parent_effective: break; } } - +get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -1953,13 +2179,8 @@ update_parent_effective: if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) cpumask_and(cp->effective_xcpus, cp->cpus_allowed, parent->effective_xcpus); - if (new_prs < 0) { - /* Reset partition data */ - cp->nr_subparts = 0; - cpumask_clear(cp->effective_xcpus); - if (is_cpu_exclusive(cp)) - clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); - } + else if (new_prs < 0) + reset_partition_data(cp); spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); @@ -2157,12 +2378,23 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, goto out_free; if (is_partition_valid(cs)) { - if (invalidate) + /* + * Call remote_cpus_update() to handle valid remote partition + */ + if (is_remote_partition(cs)) + remote_cpus_update(cs, trialcs->effective_xcpus, &tmp); + else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); + } else if (!cpumask_empty(cs->exclusive_cpus)) { + /* + * Use trialcs->effective_cpus as a temp cpumask + */ + remote_partition_check(cs, trialcs->effective_xcpus, + trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); @@ -2203,6 +2435,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->exclusive_cpus); + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) @@ -2218,7 +2451,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; - compute_effective_exclusive_cpumask(trialcs, NULL); + if (*buf) + compute_effective_exclusive_cpumask(trialcs, NULL); /* * Check all the descendants in update_cpumasks_hier() if @@ -2240,14 +2474,26 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, cs->prs_err = PERR_NOCPUS; } - if (invalidate) + if (is_remote_partition(cs)) { + if (invalidate) + remote_partition_disable(cs, &tmp); + else + remote_cpus_update(cs, trialcs->effective_xcpus, + &tmp); + } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); - else + } else { update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); + } + } else if (!cpumask_empty(trialcs->exclusive_cpus)) { + /* + * Use trialcs->effective_cpus as a temp cpumask + */ + remote_partition_check(cs, trialcs->effective_xcpus, + trialcs->effective_cpus, &tmp); } - spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2643,18 +2889,25 @@ out: static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; + struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; if (old_prs == new_prs) return 0; /* - * For a previously invalid partition root, leave it at being - * invalid if new_prs is not "member". + * For a previously invalid partition root with valid partition root + * parent, treat it as if it is a "member". Otherwise, reject it as + * remote partition cannot currently self-recover from an invalid + * state. 
*/ if (new_prs && is_prs_invalid(old_prs)) { - cs->partition_root_state = -new_prs; - return 0; + if (is_partition_valid(parent)) { + old_prs = PRS_MEMBER; + } else { + cs->partition_root_state = -new_prs; + return 0; + } } if (alloc_cpumasks(NULL, &tmpmask)) @@ -2665,8 +2918,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) * later if partition becomes invalid. */ if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { - struct cpuset *parent = parent_cs(cs); - spin_lock_irq(&callback_lock); cpumask_and(cs->effective_xcpus, cs->cpus_allowed, parent->effective_xcpus); @@ -2688,6 +2939,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) err = update_parent_effective_cpumask(cs, partcmd_enable, NULL, &tmpmask); + /* + * If an attempt to become local partition root fails, + * try to become a remote partition root instead. + */ + if (err && remote_partition_enable(cs, &tmpmask)) + err = 0; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. @@ -2698,8 +2955,11 @@ static int update_prstate(struct cpuset *cs, int new_prs) * Switching back to member is always allowed even if it * disables child partitions. */ - update_parent_effective_cpumask(cs, partcmd_disable, NULL, - &tmpmask); + if (is_remote_partition(cs)) + remote_partition_disable(cs, &tmpmask); + else + update_parent_effective_cpumask(cs, partcmd_disable, + NULL, &tmpmask); /* * Invalidation of child partitions will be done in @@ -3602,6 +3862,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) @@ -3637,6 +3898,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; + /* + * Clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (!is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* @@ -3891,6 +4157,7 @@ int __init cpuset_init(void) fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; + INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4006,6 +4273,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + bool remote; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -4032,9 +4300,18 @@ retry: * Compute effective_cpus for valid partition root, may invalidate * child partition roots if necessary. */ - if (is_partition_valid(cs) && is_partition_valid(parent)) + remote = is_remote_partition(cs); + if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) compute_partition_effective_cpumask(cs, &new_cpus); + if (remote && cpumask_empty(&new_cpus) && + partition_is_populated(cs, NULL)) { + remote_partition_disable(cs, tmp); + compute_effective_cpumask(&new_cpus, cs, parent); + remote = false; + cpuset_force_rebuild(); + } + /* * Force the partition to become invalid if either one of * the following conditions hold: @@ -4042,7 +4319,7 @@ retry: * 2) parent is invalid or doesn't grant any cpus to child * partitions. 
*/ - if (is_partition_valid(cs) && (!is_partition_valid(parent) || + if (is_local_partition(cs) && (!is_partition_valid(parent) || tasks_nocpu_error(parent, cs, &new_cpus))) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); compute_effective_cpumask(&new_cpus, cs, parent); -- cgit v1.2.3 From 4a74e418881f26cdeae1011453acd66cedc8ad2c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Sep 2023 09:32:41 -0400 Subject: cgroup/cpuset: Check partition conflict with housekeeping setup A user can pre-configure certain CPUs in an isolated state at boot time with the "isolcpus" kernel boot command line option. Those CPUs will not be in the housekeeping_cpumask(HK_TYPE_DOMAIN) and so will not be in any sched domains. This may conflict with the partition setup at runtime. Those boot time isolated CPUs should only be used in an isolated partition. This patch adds the necessary check and disallows partition setup if the check fails. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 7ac320e079b8..15f399153a2e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -75,6 +75,7 @@ enum prs_errcode { PERR_NOCPUS, PERR_HOTPLUG, PERR_CPUSEMPTY, + PERR_HKEEPING, }; static const char * const perr_strings[] = { @@ -85,6 +86,7 @@ static const char * const perr_strings[] = { [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", }; struct cpuset { @@ -1646,6 +1648,26 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, rebuild_sched_domains_locked(); } +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in + * an isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); + bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + + if (!all_in_hk && (prstate != PRS_ISOLATED)) + return true; + + return false; +} + /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1748,6 +1770,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, !cpumask_intersects(xcpus, parent->effective_xcpus)) return PERR_INVCPUS; + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + /* * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. 
@@ -2335,6 +2360,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_INVCPUS; + } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_HKEEPING; } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_NOCPUS; @@ -2469,6 +2497,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_INVCPUS; + } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_HKEEPING; } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_NOCPUS; -- cgit v1.2.3 From 9b81d3a5be05d350ac93d99762c7ee91fe29b4cb Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 27 Sep 2023 14:25:40 +0000 Subject: cgroup: add cgroup_favordynmods= command-line option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have a need of using favordynmods with cgroup v1, which doesn't support changing mount flags during remount. Enabling CONFIG_CGROUP_FAVOR_DYNMODS at build-time is not an option because we want to be able to selectively enable it for certain systems. This commit addresses this by introducing the cgroup_favordynmods= command-line option. This option works for both cgroup v1 and v2 and also allows for disabling favorynmods when the kernel built with CONFIG_CGROUP_FAVOR_DYNMODS=y. Also, note that when cgroup_favordynmods=true favordynmods is never disabled in cgroup_destroy_root(). Signed-off-by: Luiz Capitulino Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/cgroup/cgroup.c | 18 ++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..8b744d39d393 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -580,6 +580,10 @@ named mounts. Specifying both "all" and "named" disables all v1 hierarchies. + cgroup_favordynmods= [KNL] Enable or Disable favordynmods. + Format: { "true" | "false" } + Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS. + cgroup.memory= [KNL] Pass options to the cgroup memory controller. Format: nosocket -- Disable socket memory accounting. 
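For example, the option is passed on the kernel command line like any other boot parameter, with the values given in the Format line above:

    cgroup_favordynmods=true      # favor dynamic modification on both v1 and v2 mounts
    cgroup_favordynmods=false     # opt out on a CONFIG_CGROUP_FAVOR_DYNMODS=y build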
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 833ac6dd15d9..059cd5651d41 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; +static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1350,7 +1352,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } - cgroup_favor_dynmods(root, false); + if (!have_favordynmods) + cgroup_favor_dynmods(root, false); + cgroup_exit_root_id(root); cgroup_unlock(); @@ -2245,9 +2249,9 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; -#ifdef CONFIG_CGROUP_FAVOR_DYNMODS - ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; -#endif + if (have_favordynmods) + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; } @@ -6766,6 +6770,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest -- cgit v1.2.3 From 46c521bac592251229acdd2cd67976a7b1f88bed Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 3 Oct 2023 10:44:20 -0400 Subject: cgroup/cpuset: Enable invalid to valid local partition transition When a local partition becomes invalid, it won't transition back to valid partition automatically if a proper "cpuset.cpus.exclusive" or "cpuset.cpus" change is made. Instead, system administrators have to explicitly echo "root" or "isolated" into the "cpuset.cpus.partition" file at the partition root. This patch now enables the automatic transition of an invalid local partition back to valid when there is a proper "cpuset.cpus.exclusive" or "cpuset.cpus" change. Automatic transition of an invalid remote partition to a valid one, however, is not covered by this patch. They still need an explicit write to "cpuset.cpus.partition" to become valid again. The test_cpuset_prs.sh test script is updated to add new test cases to test this automatic state transition. 
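A minimal sketch of the new behaviour, using hypothetical paths and CPU numbers; A1 is a local partition root that an earlier "cpuset.cpus" change has invalidated:

    cat A1/cpuset.cpus.partition      # reports "root invalid (<reason>)"
    echo 0-3 > A1/cpuset.cpus         # restore a usable exclusive CPU list
    cat A1/cpuset.cpus.partition      # reports "root" again, with no extra
                                      # write to cpuset.cpus.partition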
Reported-by: Pierre Gondois Link: https://lore.kernel.org/lkml/9777f0d2-2fdf-41cb-bd01-19c52939ef42@arm.com Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 79 ++++++++++++++--------- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 21 ++++-- 2 files changed, 62 insertions(+), 38 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 15f399153a2e..93facdab513c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1806,17 +1806,28 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * * Compute add/delete mask to/from effective_cpus * - * addmask = effective_xcpus & ~newmask & parent->effective_xcpus - * delmask = newmask & ~cs->effective_xcpus - * & parent->effective_xcpus + * For valid partition: + * addmask = exclusive_cpus & ~newmask + * & parent->effective_xcpus + * delmask = newmask & ~exclusive_cpus + * & parent->effective_xcpus + * + * For invalid partition: + * delmask = newmask & parent->effective_xcpus */ - cpumask_andnot(tmp->addmask, xcpus, newmask); - adding = cpumask_and(tmp->addmask, tmp->addmask, - parent->effective_xcpus); + if (is_prs_invalid(old_prs)) { + adding = false; + deleting = cpumask_and(tmp->delmask, + newmask, parent->effective_xcpus); + } else { + cpumask_andnot(tmp->addmask, xcpus, newmask); + adding = cpumask_and(tmp->addmask, tmp->addmask, + parent->effective_xcpus); - cpumask_andnot(tmp->delmask, newmask, xcpus); - deleting = cpumask_and(tmp->delmask, tmp->delmask, - parent->effective_xcpus); + cpumask_andnot(tmp->delmask, newmask, xcpus); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->effective_xcpus); + } /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. @@ -1910,9 +1921,11 @@ write_error: /* * Transitioning between invalid to valid or vice versa may require - * changing CS_CPU_EXCLUSIVE. + * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, + * validate_change() has already been successfully called and + * CPU lists in cs haven't been updated yet. So defer it to later. 
*/ - if (old_prs != new_prs) { + if ((old_prs != new_prs) && (cmd != partcmd_update)) { int err = update_partition_exclusive(cs, new_prs); if (err) @@ -1960,6 +1973,9 @@ write_error: spin_unlock_irq(&callback_lock); + if ((old_prs != new_prs) && (cmd == partcmd_update)) + update_partition_exclusive(cs, new_prs); + if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); @@ -2356,8 +2372,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; - if (is_partition_valid(cs)) { - if (cpumask_empty(trialcs->effective_xcpus)) { + if (old_prs) { + if (is_partition_valid(cs) && + cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_INVCPUS; } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { @@ -2391,13 +2408,16 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ invalidate = true; rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) + cpuset_for_each_child(cp, css, parent) { + struct cpumask *xcpus = fetch_xcpus(trialcs); + if (is_partition_valid(cp) && - cpumask_intersects(trialcs->effective_xcpus, cp->effective_xcpus)) { + cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); rcu_read_lock(); } + } rcu_read_unlock(); retval = 0; } @@ -2405,18 +2425,24 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto out_free; - if (is_partition_valid(cs)) { + if (is_partition_valid(cs) || + (is_partition_invalid(cs) && !invalidate)) { + struct cpumask *xcpus = trialcs->effective_xcpus; + + if (cpumask_empty(xcpus) && is_partition_invalid(cs)) + xcpus = trialcs->cpus_allowed; + /* * Call remote_cpus_update() to handle valid remote partition */ if (is_remote_partition(cs)) - remote_cpus_update(cs, trialcs->effective_xcpus, &tmp); + remote_cpus_update(cs, xcpus, &tmp); else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, - trialcs->effective_xcpus, &tmp); + xcpus, &tmp); } else if (!cpumask_empty(cs->exclusive_cpus)) { /* * Use trialcs->effective_cpus as a temp cpumask @@ -2493,7 +2519,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval) return retval; - if (is_partition_valid(cs)) { + if (old_prs) { if (cpumask_empty(trialcs->effective_xcpus)) { invalidate = true; cs->prs_err = PERR_INVCPUS; @@ -2927,19 +2953,10 @@ static int update_prstate(struct cpuset *cs, int new_prs) return 0; /* - * For a previously invalid partition root with valid partition root - * parent, treat it as if it is a "member". Otherwise, reject it as - * remote partition cannot currently self-recover from an invalid - * state. + * Treat a previously invalid partition root as if it is a "member". 
*/ - if (new_prs && is_prs_invalid(old_prs)) { - if (is_partition_valid(parent)) { - old_prs = PRS_MEMBER; - } else { - cs->partition_root_state = -new_prs; - return 0; - } - } + if (new_prs && is_prs_invalid(old_prs)) + old_prs = PRS_MEMBER; if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 0f4f4a57ae12..a6e9848189d6 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -220,7 +220,8 @@ test_isolated() # +- B1 # # P = set cpus.partition (0:member, 1:root, 2:isolated) -# C = add cpu-list +# C = add cpu-list to cpuset.cpus +# X = add cpu-list to cpuset.cpus.exclusive # S
= use prefix in subtree_control # T = put a task into cgroup # O= = Write to CPU online file of @@ -318,16 +319,19 @@ TEST_MATRIX=( " C0-3:S+ C1-3:S+ C2 . X2-3 X2-3 T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2" # cpus.exclusive.effective clearing test - " C0-3:S+ C1-3:S+ C2 . X2-3:X . . . 0 A1:0-3,A2:1-3,A3:2,XA1:" + " C0-3:S+ C1-3:S+ C2 . X2-3:X . . . 0 A1:0-3,A2:1-3,A3:2,XA1:" - # Invalid to valid remote partition indirect transition test via member - " C0-3:S+ C1-3 . . . X3:P2 . . 0 A1:0-3,A2:1-3,XA2: A2:P-2" + # Invalid to valid remote partition transition test + " C0-3:S+ C1-3 . . . X3:P2 . . 0 A1:0-3,A2:1-3,XA2: A2:P-2" " C0-3:S+ C1-3:X3:P2 - . . X2-3 P0:P2 . . 0 A1:0-2,A2:3,XA2:3 A2:P2 3" + . . X2-3 P2 . . 0 A1:0-2,A2:3,XA2:3 A2:P2 3" # Invalid to valid local partition direct transition tests - " C1-3:S+:P2 C2-3:X1:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3" - " C1-3:S+:P2 C2-3:X1:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3" + " C1-3:S+:P2 C2-3:X1:P2 . . . . . . 0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3" + " C1-3:S+:P2 C2-3:X1:P2 . . . X3:P2 . . 0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3" + " C0-3:P2 . . C4-6 C0-4 . . . 0 A1:0-4,B1:4-6 A1:P-2,B1:P0" + " C0-3:P2 . . C4-6 C0-4:C0-3 . . . 0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3" + " C0-3:P2 . . C3-5:C4-5 . . . . 0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3" # Local partition invalidation tests " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ @@ -336,6 +340,9 @@ TEST_MATRIX=( . . X4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" " C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \ . . C4 . . 0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3" + # Local partition CPU change tests + " C0-5:S+:P2 C4-5:S+:P1 . . . C3-5 . . 0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2" + " C0-5:S+:P2 C4-5:S+:P1 . . C1-5 . . . 0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3" # cpus_allowed/exclusive_cpus update tests " C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \ -- cgit v1.2.3 From 783a8334ec1cadefbb992ca2adbb459b0ee0f9f7 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 26 Sep 2023 23:58:01 -0700 Subject: cgroup/cpuset: Cleanup signedness issue in cpu_exclusive_check() Smatch complains about returning negative error codes from a type bool function. kernel/cgroup/cpuset.c:705 cpu_exclusive_check() warn: signedness bug returning '(-22)' The code works correctly, but it is confusing. The current behavior is that cpu_exclusive_check() returns true if it's *NOT* exclusive. Rename it to cpusets_are_exclusive() and reverse the returns so it returns true if it is exclusive and false if it's not. Update both callers as well. 
Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202309201706.2LhKdM6o-lkp@intel.com/ Signed-off-by: Harshit Mogalapalli Reviewed-by: Kamalesh Babulal Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 93facdab513c..615daaf87f1f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -719,18 +719,18 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs) } /* - * cpu_exclusive_check() - check if two cpusets are exclusive + * cpusets_are_exclusive() - check if two cpusets are exclusive * - * Return 0 if exclusive, -EINVAL if not + * Return true if exclusive, false if not */ -static inline bool cpu_exclusive_check(struct cpuset *cs1, struct cpuset *cs2) +static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { struct cpumask *xcpus1 = fetch_xcpus(cs1); struct cpumask *xcpus2 = fetch_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) - return -EINVAL; - return 0; + return false; + return true; } /* @@ -833,7 +833,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur) { - if (cpu_exclusive_check(trial, c)) + if (!cpusets_are_exclusive(trial, c)) goto out; } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && @@ -1875,7 +1875,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, cpuset_for_each_child(child, css, parent) { if (child == cs) continue; - if (cpu_exclusive_check(cs, child)) { + if (!cpusets_are_exclusive(cs, child)) { exclusive = false; break; } -- cgit v1.2.3 From 27a6c5c50c4bb0c56296f01a3142db796bb01da1 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Fri, 6 Oct 2023 17:20:32 +0530 Subject: cgroup: use legacy_name for cgroup v1 disable info cgroup v1 or v2 or both controller names can be passed as arguments to the 'cgroup_no_v1' kernel parameter, though most of the controller's names are the same for both cgroup versions. This can be confusing when both versions are used interchangeably, i.e., passing cgroup_no_v1=io $ sudo dmesg |grep cgroup ... cgroup: Disabling io control group subsystem in v1 mounts cgroup: Disabled controller 'blkio' Make it consistent across the pr_info()'s, by using ss->legacy_name, as the subsystem name, while printing the cgroup v1 controller disabling information in cgroup_init(). Signed-off-by: Kamalesh Babulal Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 059cd5651d41..c09531bace38 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6127,7 +6127,7 @@ int __init cgroup_init(void) if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", - ss->name); + ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; -- cgit v1.2.3
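With ss->legacy_name used above, the two messages quoted in the changelog are expected to agree; booting the patched kernel with cgroup_no_v1=io should log along the lines of:

    cgroup: Disabling blkio control group subsystem in v1 mounts
    cgroup: Disabled controller 'blkio'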