diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 453 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 500 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 2 |
5 files changed, 627 insertions, 332 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aad41f06901b..7dc8788cfd52 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* + * Set by the boot param of the same name and makes subsystems with NULL + * ->dfl_files to use ->legacy_files on the default hierarchy. + */ +static bool cgroup_legacy_files_on_dfl; + /* some controllers are not supported in the default hierarchy */ -static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 -#ifdef CONFIG_CGROUP_DEBUG - | (1 << debug_cgrp_id) -#endif - ; +static unsigned int cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp) } /** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask. In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function determines which subsystems need to be enabled given the + * current @cgrp->subtree_control and records it in + * @cgrp->child_subsys_mask. The resulting mask is always a superset of + * @cgrp->subtree_control and follows the usual hierarchy rules. + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + unsigned int cur_ss_mask = cgrp->subtree_control; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + if (!cgroup_on_dfl(cgrp)) { + cgrp->child_subsys_mask = cur_ss_mask; + return; + } + + while (true) { + unsigned int new_ss_mask = cur_ss_mask; + + for_each_subsys(ss, ssid) + if (cur_ss_mask & (1 << ssid)) + new_ss_mask |= ss->depends_on; + + /* + * Mask out subsystems which aren't available. This can + * happen only if some depended-upon subsystems were bound + * to non-default hierarchies. + */ + if (parent) + new_ss_mask &= parent->child_subsys_mask; + else + new_ss_mask &= cgrp->root->subsys_mask; + + if (new_ss_mask == cur_ss_mask) + break; + cur_ss_mask = new_ss_mask; + } + + cgrp->child_subsys_mask = cur_ss_mask; +} + +/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) up_write(&css_set_rwsem); src_root->subsys_mask &= ~(1 << ssid); - src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + src_root->cgrp.subtree_control &= ~(1 << ssid); + cgroup_refresh_child_subsys_mask(&src_root->cgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - if (dst_root != &cgrp_dfl_root) - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) { + dst_root->cgrp.subtree_control |= 1 << ssid; + cgroup_refresh_child_subsys_mask(&dst_root->cgrp); + } if (ss->bind) ss->bind(css); @@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq, for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) - seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) bool all_ss = false, one_ss = false; unsigned int mask = -1U; struct cgroup_subsys *ss; + int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS @@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + if (!*token) return -EINVAL; if (!strcmp(token, "none")) { @@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* Consistency checks */ - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - - if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || - opts->cpuset_clone_children || opts->release_agent || - opts->name) { - pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + if (nr_opts != 1) { + pr_err("sane_behavior: no other mount options allowed\n"); return -EINVAL; } - } else { - /* - * If the 'all' option was specified select all the - * subsystems, otherwise if 'none', 'name=' and a subsystem - * name options were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So - * all empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + return 0; } /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. @@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; - /* Can't specify "none" and some subsystems */ if (opts->subsys_mask && opts->none) return -EINVAL; @@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) struct cgroup_sb_opts opts; unsigned int added_mask, removed_mask; - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: remount is not allowed\n"); + if (root == &cgrp_dfl_root) { + pr_err("remount is not allowed\n"); return -EINVAL; } @@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ - if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || + if ((opts.flags ^ root->flags) || (opts.name && strcmp(opts.name, root->name))) { pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", - root->flags & CGRP_ROOT_OPTION_MASK, root->name); + opts.flags, opts.name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct cftype *base_files; struct css_set *cset; int i, ret; @@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (root == &cgrp_dfl_root) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(root_cgrp, base_files, true); if (ret) goto destroy_root; @@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; /* look for a matching existing root */ - if (!opts.subsys_mask && !opts.none && !opts.name) { + if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { cgrp_dfl_root_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); @@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { - if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto out_unlock; - } else { - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - } - } + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); /* * We want to reuse @root whose lifetime is governed by its @@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; - - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_puts(seq, "0\n"); return 0; } @@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); return 0; } @@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } @@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; + unsigned int css_enable, css_disable, old_ctrl, new_ctrl; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { - if (cgrp->child_subsys_mask & (1 << ssid)) { + if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * @ss is already enabled through dependency and + * we'll just make it visible. Skip draining. + */ + if (cgrp->child_subsys_mask & (1 << ssid)) + continue; + /* * Because css offlining is asynchronous, userland * might try to re-enable the same controller while @@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { - ret = -ENOENT; - goto out_unlock; - } } else if (disable & (1 << ssid)) { - if (!(cgrp->child_subsys_mask & (1 << ssid))) { + if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { - if (child->child_subsys_mask & (1 << ssid)) { + if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } @@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { @@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } /* - * Create csses for enables and update child_subsys_mask. This - * changes cgroup_e_css() results which in turn makes the - * subsequent cgroup_update_dfl_csses() associate all tasks in the - * subtree to the updated csses. + * Update subsys masks and calculate what needs to be done. More + * subsystems than specified may need to be enabled or disabled + * depending on subsystem dependencies. + */ + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; + + old_ctrl = cgrp->child_subsys_mask; + cgroup_refresh_child_subsys_mask(cgrp); + new_ctrl = cgrp->child_subsys_mask; + + css_enable = ~old_ctrl & new_ctrl; + css_disable = old_ctrl & ~new_ctrl; + enable |= css_enable; + disable |= css_disable; + + /* + * Create new csses or make the existing ones visible. A css is + * created invisible if it's being implicitly enabled through + * dependency. An invisible css is made visible when the userland + * explicitly enables it. */ for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) continue; cgroup_for_each_live_child(child, cgrp) { - ret = create_css(child, ss); + if (css_enable & (1 << ssid)) + ret = create_css(child, ss, + cgrp->subtree_control & (1 << ssid)); + else + ret = cgroup_populate_dir(child, 1 << ssid); if (ret) goto err_undo_css; } } - cgrp->child_subsys_mask |= enable; - cgrp->child_subsys_mask &= ~disable; - + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ ret = cgroup_update_dfl_csses(cgrp); if (ret) goto err_undo_css; - /* all tasks are now migrated away from the old csses, kill them */ + /* + * All tasks are migrated out of disabled csses. Kill or hide + * them. A css is hidden when the userland requests it to be + * disabled while other subsystems are still depending on it. The + * css must not actively control resources and be in the vanilla + * state if it's made visible again later. Controllers which may + * be depended upon should provide ->css_reset() for this purpose. + */ for_each_subsys(ss, ssid) { if (!(disable & (1 << ssid))) continue; - cgroup_for_each_live_child(child, cgrp) - kill_css(cgroup_css(child, ss)); + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + + if (css_disable & (1 << ssid)) { + kill_css(css); + } else { + cgroup_clear_dir(child, 1 << ssid); + if (ss->css_reset) + ss->css_reset(css); + } + } } kernfs_activate(cgrp->kn); @@ -2755,8 +2853,9 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->child_subsys_mask &= ~enable; - cgrp->child_subsys_mask |= disable; + cgrp->subtree_control &= ~enable; + cgrp->subtree_control |= disable; + cgroup_refresh_child_subsys_mask(cgrp); for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -2764,8 +2863,14 @@ err_undo_css: cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); - if (css) + + if (!css) + continue; + + if (css_enable & (1 << ssid)) kill_css(css); + else + cgroup_clear_dir(child, 1 << ssid); } } goto out_unlock; @@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, /* * This isn't a proper migration and its usefulness is very - * limited. Disallow if sane_behavior. + * limited. Disallow on the default hierarchy. */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return -EPERM; /* @@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ - if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; - if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; @@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; + + /* revert flags set by cgroup core while adding @cfts */ + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); } } @@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the default hierarchy. + */ +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_ONLY_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the legacy hierarchies. + */ +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + return cgroup_add_cftypes(ss, cfts); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -3699,8 +3841,9 @@ after: * * All this extra complexity was caused by the original implementation * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if - * sane_behavior so that no such expectation exists in the new interface. + * want to do away with it. Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface. * * Scrambling is done by swapping every two consecutive bits, which is * non-identity one-to-one mapping which disturbs sort order sufficiently. @@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid) static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) { - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) return pid_fry(pid); else return pid; @@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - if (cgroup_sane_behavior(cgrp)) + if (cgroup_on_dfl(cgrp)) sort(array, length, sizeof(pid_t), fried_cmppid, NULL); else sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, return 0; } -static struct cftype cgroup_base_files[] = { +/* cgroup core interface files for the default hierarchy */ +static struct cftype cgroup_dfl_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, @@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = { .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.clone_children", - .flags = CFTYPE_INSANE, - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_root_controllers_show, }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", - .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_populated_show, }, + { } /* terminate */ +}; - /* - * Historical crazy stuff. These don't have "cgroup." prefix and - * don't exist if sane_behavior. If you're depending on these, be - * prepared to be burned. - */ +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, { .name = "tasks", - .flags = CFTYPE_INSANE, /* use "procs" instead */ .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, @@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = { }, { .name = "notify_on_release", - .flags = CFTYPE_INSANE, .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", - .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, + .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, @@ -4316,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css) * create_css - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css + * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created. - * Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp with all interface files created if + * @visible. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, + bool visible) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4345,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_percpu_ref; css->id = err; - err = cgroup_populate_dir(cgrp, 1 << ss->id); - if (err) - goto err_free_id; + if (visible) { + err = cgroup_populate_dir(cgrp, 1 << ss->id); + if (err) + goto err_free_id; + } /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); @@ -4387,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; + struct cftype *base_files; int ssid, ret; parent = cgroup_kn_lock_live(parent_kn); @@ -4457,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (cgroup_on_dfl(cgrp)) + base_files = cgroup_dfl_base_files; + else + base_files = cgroup_legacy_base_files; + + ret = cgroup_addrm_files(cgrp, base_files, true); if (ret) goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss); + ret = create_css(cgrp, ss, + parent->subtree_control & (1 << ssid)); if (ret) goto out_destroy; } @@ -4472,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * On the default hierarchy, a child doesn't automatically inherit - * child_subsys_mask from the parent. Each is configured manually. + * subtree_control from the parent. Each is configured manually. */ - if (!cgroup_on_dfl(cgrp)) - cgrp->child_subsys_mask = parent->child_subsys_mask; + if (!cgroup_on_dfl(cgrp)) { + cgrp->subtree_control = parent->subtree_control; + cgroup_refresh_child_subsys_mask(cgrp); + } kernfs_activate(kn); @@ -4738,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = - { .flags = CGRP_ROOT_SANE_BEHAVIOR }; + static struct cgroup_sb_opts __initdata opts; struct cgroup_subsys *ss; int i; @@ -4777,7 +4938,8 @@ int __init cgroup_init(void) unsigned long key; int ssid, err; - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); mutex_lock(&cgroup_mutex); @@ -4809,9 +4971,22 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (!ss->disabled) { - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + if (ss->disabled) + continue; + + cgrp_dfl_root.subsys_mask |= 1 << ss->id; + + if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) + ss->dfl_cftypes = ss->legacy_cftypes; + + if (!ss->dfl_cftypes) + cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + + if (ss->dfl_cftypes == ss->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); + } else { + WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); + WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } } @@ -5207,6 +5382,14 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_set_legacy_files_on_dfl(char *str) +{ + printk("cgroup: using legacy files on the default hierarchy\n"); + cgroup_legacy_files_on_dfl = true; + return 0; +} +__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5401,6 +5584,6 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .base_cftypes = debug_files, + .legacy_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a79e40f9d700..92b98cc0ee76 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = { .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, - .base_cftypes = files, + .legacy_cftypes = files, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 116a4164720a..22874d7cf2c0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -76,8 +76,34 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierachy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; /* * This is old Memory Nodes tasks took on. @@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) cs = parent_cs(cs); - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); + cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } /* @@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) + while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); + nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /* @@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!trial) return NULL; - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { - kfree(trial); - return NULL; - } - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) + goto free_cpus; + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; + +free_cpus: + free_cpumask_var(trial->cpus_allowed); +free_cs: + kfree(trial); + return NULL; } /** @@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) */ static void free_trial_cpuset(struct cpuset *trial) { + free_cpumask_var(trial->effective_cpus); free_cpumask_var(trial->cpus_allowed); kfree(trial); } @@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* We must be a subset of our parent cpuset */ + /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!is_cpuset_subset(trial, par)) + if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) goto out; /* @@ -480,11 +514,11 @@ out: #ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? + * Do cpusets a, b have overlapping effective cpus_allowed masks? */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->effective_cpus, b->effective_cpus); } static void @@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.effective_cpus); goto done; } @@ -705,7 +739,7 @@ restart: struct cpuset *b = csa[j]; if (apn == b->pn) { - cpumask_or(dp, dp, b->cpus_allowed); + cpumask_or(dp, dp, b->effective_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void) * passing doms with offlined cpu to partition_sched_domains(). * Anyways, hotplug work item will rebuild sched domains. */ - if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) goto out; /* Generate domain masks and attrs */ @@ -781,45 +815,6 @@ void rebuild_sched_domains(void) mutex_unlock(&cpuset_mutex); } -/* - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus - * @cs: the cpuset in interest - * - * A cpuset's effective cpumask is the cpumask of the nearest ancestor - * with non-empty cpus. We use effective cpumask whenever: - * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask - * if the cpuset they reside in has no cpus) - * - we want to retrieve task_cs(tsk)'s cpus_allowed. - * - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an - * exception. See comments there. - */ -static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) -{ - while (cpumask_empty(cs->cpus_allowed)) - cs = parent_cs(cs); - return cs; -} - -/* - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems - * @cs: the cpuset in interest - * - * A cpuset's effective nodemask is the nodemask of the nearest ancestor - * with non-empty memss. We use effective nodemask whenever: - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask - * if the cpuset they reside in has no mems) - * - we want to retrieve task_cs(tsk)'s mems_allowed. - * - * Called with cpuset_mutex held. - */ -static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) -{ - while (nodes_empty(cs->mems_allowed)) - cs = parent_cs(cs); - return cs; -} - /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) */ static void update_tasks_cpumask(struct cpuset *cs) { - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct css_task_iter it; struct task_struct *task; css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); } /* - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. - * @root_cs: the root cpuset of the hierarchy - * @update_root: update root cpuset or not? + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_cpus: temp variable for calculating new effective_cpus + * + * When congifured cpumask is changed, the effective cpumasks of this cpuset + * and all its descendants need to be updated. * - * This will update cpumasks of tasks in @root_cs and all other empty cpusets - * which take on cpumask of @root_cs. + * On legacy hierachy, effective_cpus will be the same with cpu_allowed. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) +static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; + bool need_rebuild_sched_domains = false; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some CPUs. + */ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent->effective_cpus); + + /* Skip the whole subtree if the cpumask remains the same. */ + if (cpumask_equal(new_cpus, cp->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + mutex_lock(&callback_mutex); + cpumask_copy(cp->effective_cpus, new_cpus); + mutex_unlock(&callback_mutex); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); + update_tasks_cpumask(cp); + /* + * If the effective cpumask of any non-empty cpuset is changed, + * we need to rebuild sched domains. + */ + if (!cpumask_empty(cp->cpus_allowed) && + is_sched_load_balance(cp)) + need_rebuild_sched_domains = true; + rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); + + if (need_rebuild_sched_domains) + rebuild_sched_domains_locked(); } /** @@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - int is_load_balanced; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, + top_cpuset.cpus_allowed)) return -EINVAL; } @@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - is_load_balanced = is_sched_load_balance(trialcs); - mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true); - - if (is_load_balanced) - rebuild_sched_domains_locked(); + /* use trialcs->cpus_allowed as a temp variable */ + update_cpumasks_hier(cs, trialcs->cpus_allowed); return 0; } @@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { struct task_struct *tsk = current; - struct cpuset *mems_cs; tsk->mems_allowed = *to; do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &tsk->mems_allowed); + guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); rcu_read_unlock(); } @@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); struct css_task_iter it; struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - guarantee_online_mems(mems_cs, &newmems); + guarantee_online_mems(cs, &newmems); /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs) } /* - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. - * @cs: the root cpuset of the hierarchy - * @update_root: update the root cpuset or not? + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_mems: a temp variable for calculating new effective_mems * - * This will update nodemasks of tasks in @root_cs and all other empty cpusets - * which take on nodemask of @root_cs. + * When configured nodemask is changed, the effective nodemasks of this cpuset + * and all its descendants need to be updated. + * + * On legacy hiearchy, effective_mems will be the same with mems_allowed. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) +static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) { - if (!update_root) - continue; - } else { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + cpuset_for_each_descendant_pre(cp, pos_css, cs) { + struct cpuset *parent = parent_cs(cp); + + nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + + /* + * If it becomes empty, inherit the effective mask of the + * parent, which is guaranteed to have some MEMs. + */ + if (nodes_empty(*new_mems)) + *new_mems = parent->effective_mems; + + /* Skip the whole subtree if the nodemask remains the same. */ + if (nodes_equal(*new_mems, cp->effective_mems)) { + pos_css = css_rightmost_descendant(pos_css); + continue; } + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); + mutex_lock(&callback_mutex); + cp->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && + !nodes_equal(cp->mems_allowed, cp->effective_mems)); + update_tasks_nodemask(cp); rcu_read_lock(); @@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_MEMORY])) { - retval = -EINVAL; + top_cpuset.mems_allowed)) { + retval = -EINVAL; goto done; } } @@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true); + /* use trialcs->mems_allowed as a temp variable */ + update_nodemasks_hier(cs, &cs->mems_allowed); done: return retval; } @@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, mutex_lock(&cpuset_mutex); - /* - * We allow to move tasks into an empty cpuset if sane_behavior - * flag is set. - */ + /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_sane_behavior(css->cgroup) && + if (!cgroup_on_dfl(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *leader = cgroup_taskset_first(tset); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cpuset_attach_old_cs; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); mutex_lock(&cpuset_mutex); @@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, if (cs == &top_cpuset) cpumask_copy(cpus_attach, cpu_possible_mask); else - guarantee_online_cpus(cpus_cs, cpus_attach); + guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, tset) { /* @@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * Change mm, possibly for multiple threads in a threadgroup. This is * expensive and may sleep. */ - cpuset_attach_nodemask_to = cs->mems_allowed; + cpuset_attach_nodemask_to = cs->effective_mems; mm = get_task_mm(leader); if (mm) { - struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); - mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); /* @@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * mm from. */ if (is_memory_migrate(cs)) { - cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); } mmput(mm); @@ -1516,6 +1541,8 @@ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_MEMLIST: s += nodelist_scnprintf(s, count, cs->mems_allowed); break; + case FILE_EFFECTIVE_CPULIST: + s += cpulist_scnprintf(s, count, cs->effective_cpus); + break; + case FILE_EFFECTIVE_MEMLIST: + s += nodelist_scnprintf(s, count, cs->effective_mems); + break; default: ret = -EINVAL; goto out_unlock; @@ -1779,6 +1812,18 @@ static struct cftype files[] = { }, { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, @@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { - kfree(cs); - return ERR_PTR(-ENOMEM); - } + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) + goto free_cs; + if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) + goto free_cpus; set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); + cpumask_clear(cs->effective_cpus); + nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; return &cs->css; + +free_cpus: + free_cpumask_var(cs->cpus_allowed); +free_cs: + kfree(cs); + return ERR_PTR(-ENOMEM); } static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); + mutex_lock(&callback_mutex); + if (cgroup_on_dfl(cs->css.cgroup)) { + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + cs->effective_mems = parent->effective_mems; + } + mutex_unlock(&callback_mutex); + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->cpus_allowed); kfree(cs); } +static void cpuset_bind(struct cgroup_subsys_state *root_css) +{ + mutex_lock(&cpuset_mutex); + mutex_lock(&callback_mutex); + + if (cgroup_on_dfl(root_css->cgroup)) { + cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + top_cpuset.mems_allowed = node_possible_map; + } else { + cpumask_copy(top_cpuset.cpus_allowed, + top_cpuset.effective_cpus); + top_cpuset.mems_allowed = top_cpuset.effective_mems; + } + + mutex_unlock(&callback_mutex); + mutex_unlock(&cpuset_mutex); +} + struct cgroup_subsys cpuset_cgrp_subsys = { - .css_alloc = cpuset_css_alloc, - .css_online = cpuset_css_online, - .css_offline = cpuset_css_offline, - .css_free = cpuset_css_free, - .can_attach = cpuset_can_attach, - .cancel_attach = cpuset_cancel_attach, - .attach = cpuset_attach, - .base_cftypes = files, - .early_init = 1, + .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, + .css_free = cpuset_css_free, + .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, + .attach = cpuset_attach, + .bind = cpuset_bind, + .legacy_cftypes = files, + .early_init = 1, }; /** @@ -1990,9 +2070,13 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) BUG(); + if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) + BUG(); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); + cpumask_setall(top_cpuset.effective_cpus); + nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void +hotplug_update_tasks_legacy(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + mutex_lock(&callback_mutex); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migratecd to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + update_tasks_cpumask(cs); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + mutex_lock(&cpuset_mutex); +} + +static void +hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + if (cpumask_empty(new_cpus)) + cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); + if (nodes_empty(*new_mems)) + *new_mems = parent_cs(cs)->effective_mems; + + mutex_lock(&callback_mutex); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->effective_mems = *new_mems; + mutex_unlock(&callback_mutex); + + if (cpus_updated) + update_tasks_cpumask(cs); + if (mems_updated) + update_tasks_nodemask(cs); +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void cpuset_hotplug_update_tasks(struct cpuset *cs) { - static cpumask_t off_cpus; - static nodemask_t off_mems; - bool is_empty; - bool sane = cgroup_sane_behavior(cs->css.cgroup); - + static cpumask_t new_cpus; + static nodemask_t new_mems; + bool cpus_updated; + bool mems_updated; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2064,51 +2207,20 @@ retry: goto retry; } - cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); - nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - - mutex_lock(&callback_mutex); - cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' cpumask - * for empty cpuset to take on ancestor's cpumask. Otherwise, don't - * call update_tasks_cpumask() if the cpuset becomes empty, as - * the tasks in it will be migrated to an ancestor. - */ - if ((sane && cpumask_empty(cs->cpus_allowed)) || - (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs); + cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); + nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); - mutex_lock(&callback_mutex); - nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); - mutex_unlock(&callback_mutex); - - /* - * If sane_behavior flag is set, we need to update tasks' nodemask - * for empty cpuset to take on ancestor's nodemask. Otherwise, don't - * call update_tasks_nodemask() if the cpuset becomes empty, as - * the tasks in it will be migratd to an ancestor. - */ - if ((sane && nodes_empty(cs->mems_allowed)) || - (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs); + cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); + mems_updated = !nodes_equal(new_mems, cs->effective_mems); - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); + if (cgroup_on_dfl(cs->css.cgroup)) + hotplug_update_tasks(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); + else + hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpus_updated, mems_updated); mutex_unlock(&cpuset_mutex); - - /* - * If sane_behavior flag is set, we'll keep tasks in empty cpusets. - * - * Otherwise move tasks to the nearest ancestor with execution - * resources. This is full cgroup operation which will - * also call back into cpuset. Should be done outside any lock. - */ - if (!sane && is_empty) - remove_tasks_in_empty_cpuset(cs); } /** @@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; + bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); mutex_lock(&cpuset_mutex); @@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; - cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); - mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + if (!on_dfl) + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + cpumask_copy(top_cpuset.effective_cpus, &new_cpus); mutex_unlock(&callback_mutex); /* we don't mess with cpumasks of tasks in top_cpuset */ } @@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = new_mems; + if (!on_dfl) + top_cpuset.mems_allowed = new_mems; + top_cpuset.effective_mems = new_mems; mutex_unlock(&callback_mutex); update_tasks_nodemask(&top_cpuset); } @@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_MEMORY]; top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; + cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); + top_cpuset.effective_mems = node_states[N_MEMORY]; + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); } @@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { - struct cpuset *cpus_cs; - mutex_lock(&callback_mutex); rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - guarantee_online_cpus(cpus_cs, pmask); + guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); mutex_unlock(&callback_mutex); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - struct cpuset *cpus_cs; - rcu_read_lock(); - cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); - do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); + do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); rcu_read_unlock(); /* @@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { - struct cpuset *mems_cs; nodemask_t mask; mutex_lock(&callback_mutex); rcu_read_lock(); - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); - guarantee_online_mems(mems_cs, &mask); + guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); mutex_unlock(&callback_mutex); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b33449..126f7e3f04e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8083,7 +8083,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .base_cftypes = cpu_files, + .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c94ec4..dd7cbb55bbf2 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .base_cftypes = files, + .legacy_cftypes = files, .early_init = 1, }; |