author		Linus Torvalds <torvalds@linux-foundation.org>	2023-09-02 01:58:21 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-09-02 01:58:21 +0300
commit		7716f383a58314378604eecdd66949ea2cd80ef3 (patch)
tree		1d63c807c6bb4af84d7fa30a5476aa16e55708e1 /kernel/cgroup
parent		e987af4546ac5de50e514182c1d0ca33843fa665 (diff)
parent		78d44b824ed04dd1553c55c5b839c9a55cbcaf4e (diff)
Merge tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- Per-cpu cpu usage stats are now tracked
This currently isn't printed out in the cgroupfs interface and can
only be accessed through e.g. BPF. Should decide on a not-too-ugly
way to show per-cpu stats in cgroupfs
- cpuset received some cleanups and preparatory patches for the pending
cpus.exclusive patchset which will allow cpuset partitions to be
created below non-partition parents, which should ease the management
of partition cpusets (a rough userspace sketch of the partition
interface follows this list)
- A lot of code and documentation cleanup patches
- tools/testing/selftests/cgroup/test_cpuset.c added
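As a rough illustration (not part of this merge), the v2 partition interface that the cpuset work above prepares for can be driven from userspace like this; it assumes a cgroup2 mount at /sys/fs/cgroup with the cpuset controller enabled in the root's subtree_control, and "part0" is a hypothetical group name:

/*
 * Minimal sketch: carve a cpuset partition out of the v2 hierarchy.
 * Paths and the "part0" group name are assumptions for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* enable the cpuset controller for children of the root */
	write_file("/sys/fs/cgroup/cgroup.subtree_control", "+cpuset");

	mkdir("/sys/fs/cgroup/part0", 0755);	/* hypothetical group */

	/* give the group CPUs 2-3, then promote it to a partition root */
	write_file("/sys/fs/cgroup/part0/cpuset.cpus", "2-3");
	write_file("/sys/fs/cgroup/part0/cpuset.cpus.partition", "root");
	return 0;
}

Reading cpuset.cpus.partition back reports the effective state ("root", "isolated", "member", or "root invalid" plus a reason), which is how an invalidated partition is detected.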
* tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (32 commits)
cgroup: Avoid -Wstringop-overflow warnings
cgroup:namespace: Remove unused cgroup_namespaces_init()
cgroup/rstat: Record the cumulative per-cpu time of cgroup and its descendants
cgroup: clean up if condition in cgroup_pidlist_start()
cgroup: fix obsolete function name in cgroup_destroy_locked()
Documentation: cgroup-v2.rst: Correct number of stats entries
cgroup: fix obsolete function name above css_free_rwork_fn()
cgroup/cpuset: fix kernel-doc
cgroup: clean up printk()
cgroup: fix obsolete comment above cgroup_create()
docs: cgroup-v1: fix typo
docs: cgroup-v1: correct the term of Page Cache organization in inode
cgroup/misc: Store atomic64_t reads to u64
cgroup/misc: Change counters to be explicit 64bit types
cgroup/misc: update struct members descriptions
cgroup: remove cgrp->kn check in css_populate_dir()
cgroup: fix obsolete function name
cgroup: use cached local variable parent in for loop
cgroup: remove obsolete comment above struct cgroupstats
cgroup: put cgroup_tryget_css() inside CONFIG_CGROUP_SCHED
...
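The misc controller commits in the list above widen the per-resource counters from unsigned long to explicit 64-bit types. As a hedged sketch (not part of this series) of the cgroupfs files those counters back — assuming a cgroup2 mount at /sys/fs/cgroup, a hypothetical child group "vm0", and whatever resources the host actually registers (e.g. "sev"):

/*
 * Read misc.capacity and set a limit in a child group's misc.max.
 * Each line has the form "<resource> <value>"; "max" means unlimited.
 */
#include <stdio.h>

int main(void)
{
	char name[64];
	unsigned long long val;	/* matches the new 64-bit counters */
	FILE *f = fopen("/sys/fs/cgroup/misc.capacity", "r");

	if (!f)
		return 1;
	/* e.g. "sev 509" on an SEV-capable host */
	while (fscanf(f, "%63s %llu", name, &val) == 2)
		printf("%s capacity: %llu\n", name, val);
	fclose(f);

	/* cap a child group: "<resource> <limit>" or "<resource> max" */
	f = fopen("/sys/fs/cgroup/vm0/misc.max", "w");
	if (f) {
		fprintf(f, "sev 128\n");
		fclose(f);
	}
	return 0;
}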
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--  kernel/cgroup/cgroup-v1.c |   2
-rw-r--r--  kernel/cgroup/cgroup.c    |  85
-rw-r--r--  kernel/cgroup/cpuset.c    | 264
-rw-r--r--  kernel/cgroup/misc.c      |  55
-rw-r--r--  kernel/cgroup/namespace.c |   6
-rw-r--r--  kernel/cgroup/rstat.c     |  12
6 files changed, 243 insertions(+), 181 deletions(-)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 83044312bc41..c487ffef6652 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 		if (l->list[mid] == pid) {
 			index = mid;
 			break;
-		} else if (l->list[mid] <= pid)
+		} else if (l->list[mid] < pid)
 			index = mid + 1;
 		else
 			end = mid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5fa95f86cb4d..babb34a643ea 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -493,28 +493,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 }
 
 /**
- * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get @cgrp's css associated with @ss. If the css doesn't exist
- * or is offline, %NULL is returned.
- */
-static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
-						     struct cgroup_subsys *ss)
-{
-	struct cgroup_subsys_state *css;
-
-	rcu_read_lock();
-	css = cgroup_css(cgrp, ss);
-	if (css && !css_tryget_online(css))
-		css = NULL;
-	rcu_read_unlock();
-
-	return css;
-}
-
-/**
  * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -679,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css);
  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
  * @cgrp: the target cgroup to iterate css's of
  *
- * Should be called under cgroup_[tree_]mutex.
+ * Should be called under cgroup_mutex.
  */
 #define for_each_css(css, ssid, cgrp)					\
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
@@ -929,7 +907,7 @@ static void css_set_move_task(struct task_struct *task,
 #define CSS_SET_HASH_BITS	7
 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
 {
 	unsigned long key = 0UL;
 	struct cgroup_subsys *ss;
@@ -1070,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset,
  */
 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 					struct cgroup *cgrp,
-					struct cgroup_subsys_state *template[])
+					struct cgroup_subsys_state **template)
 {
 	struct cgroup_root *root = cgrp->root;
 	struct cgroup_subsys *ss;
@@ -1736,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 	struct cftype *cfts, *failed_cfts;
 	int ret;
 
-	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+	if (css->flags & CSS_VISIBLE)
 		return 0;
 
 	if (!css->ss) {
@@ -2499,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 		/*
 		 * This function may be called both before and
-		 * after cgroup_taskset_migrate(). The two cases
+		 * after cgroup_migrate_execute(). The two cases
 		 * can be distinguished by looking at whether @cset
 		 * has its ->mg_dst_cset set.
 		 */
@@ -3654,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
-						 struct cgroup *cgrp, int ssid)
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+						     struct cgroup_subsys *ss)
 {
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	css = cgroup_css(cgrp, ss);
+	if (css && !css_tryget_online(css))
+		css = NULL;
+	rcu_read_unlock();
+
+	return css;
+}
+
+static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
 	struct cgroup_subsys *ss = cgroup_subsys[ssid];
 	struct cgroup_subsys_state *css;
 	int ret;
@@ -3672,15 +3673,15 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
 	css_put(css);
 	return ret;
 }
+#endif
 
 static int cpu_stat_show(struct seq_file *seq, void *v)
 {
-	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
 	int ret = 0;
 
 	cgroup_base_stat_cputime_show(seq);
 #ifdef CONFIG_CGROUP_SCHED
-	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+	ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
 #endif
 	return ret;
 }
@@ -4350,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 	return ret;
 }
 
-static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+static void cgroup_rm_cftypes_locked(struct cftype *cfts)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
 	list_del(&cfts->node);
 	cgroup_apply_cftypes(cfts, false);
 	cgroup_exit_cftypes(cfts);
-	return 0;
 }
 
 /**
@@ -4373,8 +4373,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts)
  */
 int cgroup_rm_cftypes(struct cftype *cfts)
 {
-	int ret;
-
 	if (!cfts || cfts[0].name[0] == '\0')
 		return 0;
 
@@ -4382,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
 		return -ENOENT;
 
 	cgroup_lock();
-	ret = cgroup_rm_cftypes_locked(cfts);
+	cgroup_rm_cftypes_locked(cfts);
 	cgroup_unlock();
-	return ret;
+	return 0;
 }
 
 /**
@@ -5337,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = {
  *    RCU callback.
  *
  * 4. After the grace period, the css can be freed. Implemented in
- *    css_free_work_fn().
+ *    css_free_rwork_fn().
  *
  * It is actually hairier because both step 2 and 4 require process context
  * and thus involve punting to css->destroy_work adding two additional
@@ -5581,8 +5579,7 @@ err_free_css:
 
 /*
  * The returned cgroup is fully initialized including its control mask, but
- * it isn't associated with its kernfs_node and doesn't have the control
- * mask applied.
+ * it doesn't have the control mask applied.
  */
 static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 				    umode_t mode)
@@ -5908,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/*
 	 * Mark @cgrp and the associated csets dead. The former prevents
 	 * further task migration and child creation by disabling
-	 * cgroup_lock_live_group(). The latter makes the csets ignored by
+	 * cgroup_kn_lock_live(). The latter makes the csets ignored by
 	 * the migration path.
 	 */
 	cgrp->self.flags &= ~CSS_ONLINE;
 
@@ -5930,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		parent->nr_threaded_children--;
 
 	spin_lock_irq(&css_set_lock);
-	for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+	for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		tcgrp->nr_descendants--;
 		tcgrp->nr_dying_descendants++;
 		/*
@@ -6123,8 +6120,8 @@ int __init cgroup_init(void)
 			continue;
 
 		if (cgroup1_ssid_disabled(ssid))
-			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
-			       ss->name);
+			pr_info("Disabling %s control group subsystem in v1 mounts\n",
+				ss->name);
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58e6f18f01c1..58ec88efa4f8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 			/*
 			 * Percpu kthreads in top_cpuset are ignored
 			 */
-			if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
+			if (kthread_is_per_cpu(task))
 				continue;
 			cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
 		} else {
@@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 static void compute_effective_cpumask(struct cpumask *new_cpus,
 				      struct cpuset *cs, struct cpuset *parent)
 {
-	if (parent->nr_subparts_cpus) {
+	if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
 		cpumask_or(new_cpus, parent->effective_cpus,
 			   parent->subparts_cpus);
 		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
@@ -1277,6 +1277,52 @@ enum subparts_cmd {
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on);
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+				    struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag
+ *
+ * Return: 0 if successful, an error code otherwise
+ */
+static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+{
+	bool exclusive = (new_prs > 0);
+
+	if (exclusive && !is_cpu_exclusive(cs)) {
+		if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+			return PERR_NOTEXCL;
+	} else if (!exclusive && is_cpu_exclusive(cs)) {
+		/* Turning off CS_CPU_EXCLUSIVE will not return error */
+		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+	}
+	return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * Changing load balance flag will automatically call
+ * rebuild_sched_domains_locked().
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+	int new_prs = cs->partition_root_state;
+	bool new_lb = (new_prs != PRS_ISOLATED);
+	bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+
+	if (new_lb != !!is_sched_load_balance(cs)) {
+		rebuild_domains = true;
+		if (new_lb)
+			set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+		else
+			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+	}
+
+	if (rebuild_domains)
+		rebuild_sched_domains_locked();
+}
 
 /**
  * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
  * @cs:      The cpuset that requests change in partition root state
@@ -1336,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 		return is_partition_invalid(parent) ?
 		       PERR_INVPARENT : PERR_NOTPART;
 	}
-	if ((newmask && cpumask_empty(newmask)) ||
-	    (!newmask && cpumask_empty(cs->cpus_allowed)))
+	if (!newmask && cpumask_empty(cs->cpus_allowed))
 		return PERR_CPUSEMPTY;
 
 	/*
@@ -1404,10 +1449,15 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
 					parent->subparts_cpus);
 		/*
+		 * Empty cpumask is not allowed
+		 */
+		if (cpumask_empty(newmask)) {
+			part_error = PERR_CPUSEMPTY;
+		/*
 		 * Make partition invalid if parent's effective_cpus could
 		 * become empty and there are tasks in the parent.
 		 */
-		if (adding &&
+		} else if (adding &&
 		    cpumask_subset(parent->effective_cpus, tmp->addmask) &&
 		    !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
 		    partition_is_populated(parent, cs)) {
@@ -1480,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 
 	/*
 	 * Transitioning between invalid to valid or vice versa may require
-	 * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+	 * changing CS_CPU_EXCLUSIVE.
 	 */
 	if (old_prs != new_prs) {
-		if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
-		    (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
-			return PERR_NOTEXCL;
-		if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
-			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+		int err = update_partition_exclusive(cs, new_prs);
+
+		if (err)
+			return err;
 	}
 
 	/*
@@ -1520,24 +1569,34 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 
 	spin_unlock_irq(&callback_lock);
 
-	if (adding || deleting)
+	if (adding || deleting) {
 		update_tasks_cpumask(parent, tmp->addmask);
+		if (parent->child_ecpus_count)
+			update_sibling_cpumasks(parent, cs, tmp);
+	}
 
 	/*
-	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
-	 * rebuild_sched_domains_locked() may be called.
+	 * For partcmd_update without newmask, it is being called from
+	 * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
+	 * Update the load balance flag and scheduling domain if
+	 * cpus_read_trylock() is successful.
 	 */
-	if (old_prs != new_prs) {
-		if (old_prs == PRS_ISOLATED)
-			update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
-		else if (new_prs == PRS_ISOLATED)
-			update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+	if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+		update_partition_sd_lb(cs, old_prs);
+		cpus_read_unlock();
 	}
+
 	notify_partition_change(cs, old_prs);
 	return 0;
 }
 
 /*
+ * update_cpumasks_hier() flags
+ */
+#define HIER_CHECKALL		0x01	/* Check all cpusets with no skipping */
+#define HIER_NO_SD_REBUILD	0x02	/* Don't rebuild sched domains */
+
+/*
  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
  * @cs:  the cpuset to consider
  * @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1551,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
  * Called with cpuset_mutex held
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
-				 bool force)
+				 int flags)
 {
 	struct cpuset *cp;
 	struct cgroup_subsys_state *pos_css;
@@ -1588,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		}
 
 		/*
-		 * Skip the whole subtree if the cpumask remains the same
-		 * and has no partition root state and force flag not set.
+		 * Skip the whole subtree if
+		 * 1) the cpumask remains the same,
+		 * 2) has no partition root state,
+		 * 3) HIER_CHECKALL flag not set, and
+		 * 4) for v2 load balance state same as its parent.
 		 */
-		if (!cp->partition_root_state && !force &&
-		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
+		if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+		    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+		    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+		    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
 			pos_css = css_rightmost_descendant(pos_css);
 			continue;
 		}
@@ -1676,6 +1740,20 @@ update_parent_subparts:
 			update_tasks_cpumask(cp, tmp->new_cpus);
 
 		/*
+		 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+		 * from parent if current cpuset isn't a valid partition root
+		 * and their load balance states differ.
+		 */
+		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+		    !is_partition_valid(cp) &&
+		    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+			if (is_sched_load_balance(parent))
+				set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+			else
+				clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+		}
+
+		/*
 		 * On legacy hierarchy, if the effective cpumask of any non-
 		 * empty cpuset is changed, we need to rebuild sched domains.
 		 * On default hierarchy, the cpuset needs to be a partition
@@ -1692,7 +1770,7 @@ update_parent_subparts:
 	}
 	rcu_read_unlock();
 
-	if (need_rebuild_sched_domains)
+	if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
 		rebuild_sched_domains_locked();
 }
 
@@ -1716,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 	 * to use the right effective_cpus value.
 	 *
 	 * The update_cpumasks_hier() function may sleep. So we have to
-	 * release the RCU read lock before calling it.
+	 * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
+	 * flag is used to suppress rebuild of sched domains as the callers
+	 * will take care of that.
 	 */
 	rcu_read_lock();
 	cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1728,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 			continue;
 
 		rcu_read_unlock();
-		update_cpumasks_hier(sibling, tmp, false);
+		update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
 		rcu_read_lock();
 		css_put(&sibling->css);
 	}
@@ -1747,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	int retval;
 	struct tmpmasks tmp;
 	bool invalidate = false;
+	int old_prs = cs->partition_root_state;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
 	if (cs == &top_cpuset)
@@ -1774,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
 		return 0;
 
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	/*
-	 * Use the cpumasks in trialcs for tmpmasks when they are pointers
-	 * to allocated cpumasks.
-	 *
-	 * Note that update_parent_subparts_cpumask() uses only addmask &
-	 * delmask, but not new_cpus.
-	 */
-	tmp.addmask  = trialcs->subparts_cpus;
-	tmp.delmask  = trialcs->effective_cpus;
-	tmp.new_cpus = NULL;
-#endif
+	if (alloc_cpumasks(NULL, &tmp))
+		return -ENOMEM;
 
 	retval = validate_change(cs, trialcs);
@@ -1814,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		retval = 0;
 	}
 	if (retval < 0)
-		return retval;
+		goto out_free;
 
 	if (cs->partition_root_state) {
 		if (invalidate)
@@ -1849,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	}
 	spin_unlock_irq(&callback_lock);
 
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	/* Now trialcs->cpus_allowed is available */
-	tmp.new_cpus = trialcs->cpus_allowed;
-#endif
-
 	/* effective_cpus will be updated here */
-	update_cpumasks_hier(cs, &tmp, false);
+	update_cpumasks_hier(cs, &tmp, 0);
 
 	if (cs->partition_root_state) {
 		struct cpuset *parent = parent_cs(cs);
@@ -1866,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		 */
 		if (parent->child_ecpus_count)
 			update_sibling_cpumasks(parent, cs, &tmp);
+
+		/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+		update_partition_sd_lb(cs, old_prs);
 	}
+out_free:
+	free_cpumasks(NULL, &tmp);
 	return 0;
 }
 
@@ -2242,7 +2313,6 @@ out:
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
 	int err = PERR_NONE, old_prs = cs->partition_root_state;
-	bool sched_domain_rebuilt = false;
 	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
 
@@ -2261,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	if (alloc_cpumasks(NULL, &tmpmask))
 		return -ENOMEM;
 
+	err = update_partition_exclusive(cs, new_prs);
+	if (err)
+		goto out;
+
 	if (!old_prs) {
 		/*
-		 * Turning on partition root requires setting the
-		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
-		 * cannot be empty.
+		 * cpus_allowed cannot be empty.
 		 */
 		if (cpumask_empty(cs->cpus_allowed)) {
 			err = PERR_CPUSEMPTY;
 			goto out;
 		}
 
-		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
-		if (err) {
-			err = PERR_NOTEXCL;
-			goto out;
-		}
-
 		err = update_parent_subparts_cpumask(cs, partcmd_enable,
 						     NULL, &tmpmask);
-		if (err) {
-			update_flag(CS_CPU_EXCLUSIVE, cs, 0);
-			goto out;
-		}
-
-		if (new_prs == PRS_ISOLATED) {
-			/*
-			 * Disable the load balance flag should not return an
-			 * error unless the system is running out of memory.
-			 */
-			update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-			sched_domain_rebuilt = true;
-		}
 	} else if (old_prs && new_prs) {
 		/*
 		 * A change in load balance state only, no change in cpumasks.
 		 */
-		update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
-		sched_domain_rebuilt = true;
-		goto out;	/* Sched domain is rebuilt in update_flag() */
+		;
 	} else {
 		/*
 		 * Switching back to member is always allowed even if it
@@ -2318,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 			compute_effective_cpumask(cs->effective_cpus, cs, parent);
 			spin_unlock_irq(&callback_lock);
 		}
-
-		/* Turning off CS_CPU_EXCLUSIVE will not return error */
-		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
-
-		if (!is_sched_load_balance(cs)) {
-			/* Make sure load balance is on */
-			update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
-			sched_domain_rebuilt = true;
-		}
 	}
-
-	update_tasks_cpumask(parent, tmpmask.new_cpus);
-
-	if (parent->child_ecpus_count)
-		update_sibling_cpumasks(parent, cs, &tmpmask);
-
-	if (!sched_domain_rebuilt)
-		rebuild_sched_domains_locked();
 out:
 	/*
-	 * Make partition invalid if an error happen
+	 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+	 * happens.
 	 */
-	if (err)
+	if (err) {
 		new_prs = -new_prs;
+		update_partition_exclusive(cs, new_prs);
+	}
+
 	spin_lock_irq(&callback_lock);
 	cs->partition_root_state = new_prs;
 	WRITE_ONCE(cs->prs_err, err);
 	spin_unlock_irq(&callback_lock);
+
 	/*
 	 * Update child cpusets, if present.
 	 * Force update if switching back to member.
 	 */
 	if (!list_empty(&cs->css.children))
-		update_cpumasks_hier(cs, &tmpmask, !new_prs);
+		update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+
+	/* Update sched domains and load balance flag */
+	update_partition_sd_lb(cs, old_prs);
 
 	notify_partition_change(cs, old_prs);
 	free_cpumasks(NULL, &tmpmask);
@@ -2487,6 +2529,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	struct cgroup_subsys_state *css;
 	struct cpuset *cs, *oldcs;
 	struct task_struct *task;
+	bool cpus_updated, mems_updated;
 	int ret;
 
 	/* used later by cpuset_attach() */
@@ -2501,13 +2544,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	if (ret)
 		goto out_unlock;
 
+	cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
 	cgroup_taskset_for_each(task, css, tset) {
 		ret = task_can_attach(task);
 		if (ret)
 			goto out_unlock;
-		ret = security_task_setscheduler(task);
-		if (ret)
-			goto out_unlock;
+
+		/*
+		 * Skip rights over task check in v2 when nothing changes,
+		 * migration permission derives from hierarchy ownership in
+		 * cgroup_procs_write_permission()).
+		 */
+		if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+		    (cpus_updated || mems_updated)) {
+			ret = security_task_setscheduler(task);
+			if (ret)
+				goto out_unlock;
+		}
 
 		if (dl_task(task)) {
 			cs->nr_migrate_dl_tasks++;
@@ -3222,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
 	}
+
+	/*
+	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+	 */
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+	    !is_sched_load_balance(parent))
+		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
 	spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -3521,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
 
-	mutex_unlock(&cpuset_mutex);
-
 	/*
 	 * Move tasks to the nearest ancestor with execution resources,
 	 * This is full cgroup operation which will also call back into
 	 * cpuset. Should be done outside any lock.
 	 */
-	if (is_empty)
+	if (is_empty) {
+		mutex_unlock(&cpuset_mutex);
 		remove_tasks_in_empty_cpuset(cs);
-
-	mutex_lock(&cpuset_mutex);
+		mutex_lock(&cpuset_mutex);
+	}
 }
 
 static void
@@ -3691,6 +3753,7 @@ unlock:
 
 /**
  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * @work: unused
  *
  * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4073,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 
 /**
  * cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
  *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ae2f4dd47508..79a3717a5803 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -14,7 +14,7 @@
 #include <linux/misc_cgroup.h>
 
 #define MAX_STR "max"
-#define MAX_NUM ULONG_MAX
+#define MAX_NUM U64_MAX
 
 /* Miscellaneous res name, keep it in sync with enum misc_res_type */
 static const char *const misc_res_name[] = {
@@ -37,7 +37,7 @@ static struct misc_cg root_cg;
  * more than the actual capacity. We are using Limits resource distribution
  * model of cgroup for miscellaneous controller.
  */
-static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+static u64 misc_res_capacity[MISC_CG_RES_TYPES];
 
 /**
  * parent_misc() - Get the parent of the passed misc cgroup.
@@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type)
  * Context: Any context.
  * Return: Current total usage of the resource.
  */
-unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+u64 misc_cg_res_total_usage(enum misc_res_type type)
 {
 	if (valid_type(type))
-		return atomic_long_read(&root_cg.res[type].usage);
+		return atomic64_read(&root_cg.res[type].usage);
 
 	return 0;
 }
@@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
  * * %0 - Successfully registered the capacity.
  * * %-EINVAL - If @type is invalid.
  */
-int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
 {
 	if (!valid_type(type))
 		return -EINVAL;
@@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
  * Context: Any context.
  */
 static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
-				  unsigned long amount)
+				  u64 amount)
 {
-	WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+	WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
 		  "misc cgroup resource %s became less than 0",
 		  misc_res_name[type]);
 }
@@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
  * * -EBUSY - If max limit will be crossed or total usage will be more than the
  *	      capacity.
 */
-int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
-		       unsigned long amount)
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
 {
 	struct misc_cg *i, *j;
 	int ret;
 	struct misc_res *res;
-	int new_usage;
+	u64 new_usage;
 
 	if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
 		return -EINVAL;
@@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
 	for (i = cg; i; i = parent_misc(i)) {
 		res = &i->res[type];
 
-		new_usage = atomic_long_add_return(amount, &res->usage);
+		new_usage = atomic64_add_return(amount, &res->usage);
 		if (new_usage > READ_ONCE(res->max) ||
 		    new_usage > READ_ONCE(misc_res_capacity[type])) {
 			ret = -EBUSY;
@@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
 
 err_charge:
 	for (j = i; j; j = parent_misc(j)) {
-		atomic_long_inc(&j->res[type].events);
+		atomic64_inc(&j->res[type].events);
 		cgroup_file_notify(&j->events_file);
 	}
 
@@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge);
  *
 * Context: Any context.
 */
-void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
-		      unsigned long amount)
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
 {
 	struct misc_cg *i;
 
@@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
 {
 	int i;
 	struct misc_cg *cg = css_misc(seq_css(sf));
-	unsigned long max;
+	u64 max;
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		if (READ_ONCE(misc_res_capacity[i])) {
@@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
 			if (max == MAX_NUM)
 				seq_printf(sf, "%s max\n", misc_res_name[i]);
 			else
-				seq_printf(sf, "%s %lu\n", misc_res_name[i],
+				seq_printf(sf, "%s %llu\n", misc_res_name[i],
 					   max);
 		}
 	}
@@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v)
 * Return:
 * * >= 0 - Number of bytes processed in the input.
 * * -EINVAL - If buf is not valid.
- * * -ERANGE - If number is bigger than the unsigned long capacity.
+ * * -ERANGE - If number is bigger than the u64 capacity.
 */
 static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
 	struct misc_cg *cg;
-	unsigned long max;
+	u64 max;
 	int ret = 0, i;
 	enum misc_res_type type = MISC_CG_RES_TYPES;
 	char *token;
@@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
 	if (!strcmp(MAX_STR, buf)) {
 		max = MAX_NUM;
 	} else {
-		ret = kstrtoul(buf, 0, &max);
+		ret = kstrtou64(buf, 0, &max);
 		if (ret)
 			return ret;
 	}
@@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
 static int misc_cg_current_show(struct seq_file *sf, void *v)
 {
 	int i;
-	unsigned long usage;
+	u64 usage;
 	struct misc_cg *cg = css_misc(seq_css(sf));
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
-		usage = atomic_long_read(&cg->res[i].usage);
+		usage = atomic64_read(&cg->res[i].usage);
 		if (READ_ONCE(misc_res_capacity[i]) || usage)
-			seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+			seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
 	}
 
 	return 0;
@@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
 static int misc_cg_capacity_show(struct seq_file *sf, void *v)
 {
 	int i;
-	unsigned long cap;
+	u64 cap;
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		cap = READ_ONCE(misc_res_capacity[i]);
 		if (cap)
-			seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+			seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
 	}
 
 	return 0;
@@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
 static int misc_events_show(struct seq_file *sf, void *v)
 {
 	struct misc_cg *cg = css_misc(seq_css(sf));
-	unsigned long events, i;
+	u64 events;
+	int i;
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
-		events = atomic_long_read(&cg->res[i].events);
+		events = atomic64_read(&cg->res[i].events);
 		if (READ_ONCE(misc_res_capacity[i]) || events)
-			seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+			seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
 	}
 	return 0;
 }
@@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		WRITE_ONCE(cg->res[i].max, MAX_NUM);
-		atomic_long_set(&cg->res[i].usage, 0);
+		atomic64_set(&cg->res[i].usage, 0);
 	}
 
 	return &cg->css;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 0d5c29879a50..144a464e45c6 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = {
 	.install	= cgroupns_install,
 	.owner		= cgroupns_owner,
 };
-
-static __init int cgroup_namespaces_init(void)
-{
-	return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 2542c21b6b6d..d80d7a608141 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 {
 	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_rstat_cpu *prstatc;
 	struct cgroup_base_stat delta;
 	unsigned seq;
 
@@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 		delta = rstatc->bstat;
 	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
 
-	/* propagate percpu delta to global */
+	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
 	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
 	cgroup_base_stat_add(&cgrp->bstat, &delta);
 	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
+	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
 
-	/* propagate global delta to parent (unless that's root) */
+	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
 	if (cgroup_parent(parent)) {
 		delta = cgrp->bstat;
 		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 		cgroup_base_stat_add(&parent->bstat, &delta);
 		cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+		delta = rstatc->subtree_bstat;
+		prstatc = cgroup_rstat_cpu(parent, cpu);
+		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
+		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
+		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
 	}
 }
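The rstat hunk above layers a per-cpu "subtree" copy onto the existing snapshot-and-delta bookkeeping. A minimal userspace model of that scheme (not kernel code; one counter and one parent/child pair, all names invented for illustration):

/*
 * Model of the bookkeeping in cgroup_base_stat_flush(): each level keeps
 * a "last_*" snapshot so only the delta since the previous flush
 * propagates upward. The kernel does this per cpu and per stat field.
 */
#include <stdio.h>

struct node {
	unsigned long long stat;	/* flushed total (cgrp->bstat) */
	unsigned long long last_stat;	/* snapshot (cgrp->last_bstat) */
	struct node *parent;
};

/* fold the child's unreported delta into its parent, as in the diff */
static void flush(struct node *n)
{
	unsigned long long delta = n->stat - n->last_stat;

	if (n->parent)
		n->parent->stat += delta;
	n->last_stat += delta;	/* remember what was already reported */
}

int main(void)
{
	struct node root = { 0, 0, NULL };
	struct node child = { 0, 0, &root };

	child.stat += 100;	/* usage recorded at the child */
	flush(&child);
	child.stat += 50;
	flush(&child);		/* only the new 50 moves up */
	printf("root=%llu child=%llu\n", root.stat, child.stat);
	/* prints: root=150 child=150 */
	return 0;
}

Because each level keeps a last_* snapshot, a flush only forwards what accumulated since the previous flush, so flushes can run at any time and in any order without double counting.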