diff options
Diffstat (limited to 'kernel/cgroup/cgroup-v1.c')
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 428 |
1 files changed, 210 insertions, 218 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 583b969b0c0e..c126b34fd4ff 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -13,9 +13,12 @@ #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> +#include <linux/fs_parser.h> #include <trace/events/cgroup.h> +#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -906,172 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo return 0; } -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); +enum cgroup1_param { + Opt_all, + Opt_clone_children, + Opt_cpuset_v2_mode, + Opt_name, + Opt_none, + Opt_noprefix, + Opt_release_agent, + Opt_xattr, +}; - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; +static const struct fs_parameter_spec cgroup1_param_specs[] = { + fsparam_flag ("all", Opt_all), + fsparam_flag ("clone_children", Opt_clone_children), + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + fsparam_string("name", Opt_name), + fsparam_flag ("none", Opt_none), + fsparam_flag ("noprefix", Opt_noprefix), + fsparam_string("release_agent", Opt_release_agent), + fsparam_flag ("xattr", Opt_xattr), + {} +}; - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? */ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; +const struct fs_parameter_description cgroup1_fs_parameters = { + .name = "cgroup1", + .specs = cgroup1_param_specs, +}; - continue; +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct cgroup_subsys *ss; + struct fs_parse_result result; + int opt, i; + + opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); + if (opt == -ENOPARAM) { + if (strcmp(param->key, "source") == 0) { + fc->source = param->string; + param->string = NULL; + return 0; } - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name)) continue; - if (!cgroup_ssid_enabled(i)) + ctx->subsys_mask |= (1 << i); + return 0; + } + return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + } + if (opt < 0) + return opt; + + switch (opt) { + case Opt_none: + /* Explicitly have no subsystems */ + ctx->none = true; + break; + case Opt_all: + ctx->all_ss = true; + break; + case Opt_noprefix: + ctx->flags |= CGRP_ROOT_NOPREFIX; + break; + case Opt_clone_children: + ctx->cpuset_clone_children = true; + break; + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + break; + case Opt_xattr: + ctx->flags |= CGRP_ROOT_XATTR; + break; + case Opt_release_agent: + /* Specifying two release agents is forbidden */ + if (ctx->release_agent) + return cg_invalf(fc, "cgroup1: release_agent respecified"); + ctx->release_agent = param->string; + param->string = NULL; + break; + case Opt_name: + /* blocked by boot param? */ + if (cgroup_no_v1_named) + return -ENOENT; + /* Can't specify an empty name */ + if (!param->size) + return cg_invalf(fc, "cgroup1: Empty name"); + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) + return cg_invalf(fc, "cgroup1: Name too long"); + /* Must match [\w.-]+ */ + for (i = 0; i < param->size; i++) { + char c = param->string[i]; + if (isalnum(c)) continue; - if (cgroup1_ssid_disabled(i)) + if ((c == '.') || (c == '-') || (c == '_')) continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; + return cg_invalf(fc, "cgroup1: Invalid name"); } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + /* Specifying two names is forbidden */ + if (ctx->name) + return cg_invalf(fc, "cgroup1: name respecified"); + ctx->name = param->string; + param->string = NULL; + break; } + return 0; +} + +static int check_cgroupfs_options(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + u16 mask = U16_MAX; + u16 enabled = 0; + struct cgroup_subsys *ss; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + enabled |= 1 << i; + + ctx->subsys_mask &= enabled; /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' + * In absense of 'none', 'name=' or subsystem name options, + * let's default to 'all'. */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); + if (!ctx->subsys_mask && !ctx->none && !ctx->name) + ctx->all_ss = true; + + if (ctx->all_ss) { + /* Mutually exclusive option 'all' + subsystem name */ + if (ctx->subsys_mask) + return cg_invalf(fc, "cgroup1: subsys name conflicts with all"); + /* 'all' => select all the subsystems */ + ctx->subsys_mask = enabled; + } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + if (!ctx->subsys_mask && !ctx->name) + return cg_invalf(fc, "cgroup1: Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) + return cg_invalf(fc, "cgroup1: noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; + if (ctx->subsys_mask && ctx->none) + return cg_invalf(fc, "cgroup1: none used incorrectly"); return 0; } -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +int cgroup1_reconfigure(struct fs_context *fc) { - int ret = 0; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; + int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = ctx->subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); + if ((ctx->flags ^ root->flags) || + (ctx->name && strcmp(ctx->name, root->name))) { + cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1088,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - if (opts.release_agent) { + if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); + strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: - kfree(opts.release_agent); - kfree(opts.name); mutex_unlock(&cgroup_mutex); return ret; } @@ -1106,30 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) +/* + * The guts of cgroup1 mount - find or create cgroup_root to use. + * Called with cgroup_mutex held; returns 0 on success, -E... on + * error and positive - in case when the candidate is busy dying. + * On success it stashes a reference to cgroup_root into given + * cgroup_fs_context; that reference is *NOT* counting towards the + * cgroup_root refcount. + */ +static int cgroup1_root_to_use(struct fs_context *fc) { - struct super_block *pinned_sb = NULL; - struct cgroup_sb_opts opts; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; - struct dentry *dentry; int i, ret; - bool new_root = false; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) - goto out_unlock; + return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may @@ -1139,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || + if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) + return 1; /* restart */ cgroup_put(&ss->root->cgrp); } @@ -1163,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ - if (opts.name) { - if (strcmp(opts.name, root->name)) + if (ctx->name) { + if (strcmp(ctx->name, root->name)) continue; name_match = true; } @@ -1173,42 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + if ((ctx->subsys_mask || ctx->none) && + (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; - ret = -EBUSY; - goto out_unlock; + return -EBUSY; } - if (root->flags ^ opts.flags) + if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; + ctx->root = root; + return 0; } /* @@ -1216,62 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } + if (!ctx->subsys_mask && !ctx->none) + return cg_invalf(fc, "cgroup1: No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } + if (ctx->ns != &init_cgroup_ns) + return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - new_root = true; + if (!root) + return -ENOMEM; - init_cgroup_root(root, &opts); + ctx->root = root; + init_cgroup_root(ctx); - ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); + ret = cgroup_setup_root(root, ctx->subsys_mask); if (ret) cgroup_free_root(root); + return ret; +} -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); +int cgroup1_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - if (ret) - return ERR_PTR(ret); + /* Check if the caller has permission to mount. */ + if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - /* - * There's a race window after we release cgroup_mutex and before - * allocating a superblock. Make sure a concurrent process won't - * be able to re-use the root during this window by delaying the - * initialization of root refcnt. - */ - if (new_root) { - mutex_lock(&cgroup_mutex); - percpu_ref_reinit(&root->cgrp.self.refcnt); - mutex_unlock(&cgroup_mutex); - } + ret = cgroup1_root_to_use(fc); + if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) + ret = 1; /* restart */ - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. - */ - if (pinned_sb) - deactivate_super(pinned_sb); + mutex_unlock(&cgroup_mutex); - return dentry; + if (!ret) + ret = cgroup_do_get_tree(fc); + + if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + deactivate_locked_super(sb); + ret = 1; + } + + if (unlikely(ret > 0)) { + msleep(10); + return restart_syscall(); + } + return ret; } static int __init cgroup1_wq_init(void) |