Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 306
1 file changed, 208 insertions, 98 deletions
| diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c153890..c03a640ef6da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,8 @@  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */  #include <linux/kthread.h>  #include <linux/delay.h> -  #include <linux/atomic.h> +#include <net/sock.h>  /*   * pidlists linger the following amount before being destroyed.  The goal @@ -98,6 +98,12 @@ static DEFINE_SPINLOCK(css_set_lock);  static DEFINE_SPINLOCK(cgroup_idr_lock);  /* + * Protects cgroup_file->kn for !self csses.  It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/*   * Protects cgroup_subsys->release_agent_path.  Modifying it also requires   * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.   */ @@ -205,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;  /* Ditto for the can_fork callback. */  static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type;  static struct cftype cgroup_dfl_base_files[];  static struct cftype cgroup_legacy_base_files[]; @@ -434,11 +441,6 @@ static bool cgroup_tryget(struct cgroup *cgrp)  	return css_tryget(&cgrp->self);  } -static void cgroup_put(struct cgroup *cgrp) -{ -	css_put(&cgrp->self); -} -  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)  {  	struct cgroup *cgrp = of->kn->parent->priv; @@ -459,25 +461,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)  }  EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor.  It also returns %true - * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ -	while (cgrp) { -		if (cgrp == ancestor) -			return true; -		cgrp = cgroup_parent(cgrp); -	} -	return false; -} -  static int notify_on_release(const struct cgroup *cgrp)  {  	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -754,9 +737,11 @@ static void put_css_set_locked(struct css_set *cset)  	if (!atomic_dec_and_test(&cset->refcount))  		return; -	/* This css_set is dead. unlink it and release cgroup refcounts */ -	for_each_subsys(ss, ssid) +	/* This css_set is dead. 
unlink it and release cgroup and css refs */ +	for_each_subsys(ss, ssid) {  		list_del(&cset->e_cset_node[ssid]); +		css_put(cset->subsys[ssid]); +	}  	hash_del(&cset->hlist);  	css_set_count--; @@ -1056,9 +1041,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	key = css_set_hash(cset->subsys);  	hash_add(css_set_table, &cset->hlist, key); -	for_each_subsys(ss, ssid) +	for_each_subsys(ss, ssid) { +		struct cgroup_subsys_state *css = cset->subsys[ssid]; +  		list_add_tail(&cset->e_cset_node[ssid], -			      &cset->subsys[ssid]->cgroup->e_csets[ssid]); +			      &css->cgroup->e_csets[ssid]); +		css_get(css); +	}  	spin_unlock_bh(&css_set_lock); @@ -1393,6 +1382,16 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)  	char name[CGROUP_FILE_NAME_MAX];  	lockdep_assert_held(&cgroup_mutex); + +	if (cft->file_offset) { +		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); +		struct cgroup_file *cfile = (void *)css + cft->file_offset; + +		spin_lock_irq(&cgroup_file_kn_lock); +		cfile->kn = NULL; +		spin_unlock_irq(&cgroup_file_kn_lock); +	} +  	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));  } @@ -1625,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			all_ss = true;  			continue;  		} -		if (!strcmp(token, "__DEVEL__sane_behavior")) { -			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; -			continue; -		}  		if (!strcmp(token, "noprefix")) {  			opts->flags |= CGRP_ROOT_NOPREFIX;  			continue; @@ -1695,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			return -ENOENT;  	} -	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { -		pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); -		if (nr_opts != 1) { -			pr_err("sane_behavior: no other mount options allowed\n"); -			return -EINVAL; -		} -		return 0; -	} -  	/*  	 * If the 'all' option was specified select all the subsystems,  	 * otherwise if 'none', 'name=' and a subsystem name options were @@ -1856,7 +1842,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  	INIT_LIST_HEAD(&cgrp->self.sibling);  	INIT_LIST_HEAD(&cgrp->self.children); -	INIT_LIST_HEAD(&cgrp->self.files);  	INIT_LIST_HEAD(&cgrp->cset_links);  	INIT_LIST_HEAD(&cgrp->pidlists);  	mutex_init(&cgrp->pidlist_mutex); @@ -1903,6 +1888,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)  	if (ret < 0)  		goto out;  	root_cgrp->id = ret; +	root_cgrp->ancestor_ids[0] = ret;  	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,  			      GFP_KERNEL); @@ -1983,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  			 int flags, const char *unused_dev_name,  			 void *data)  { +	bool is_v2 = fs_type == &cgroup2_fs_type;  	struct super_block *pinned_sb = NULL;  	struct cgroup_subsys *ss;  	struct cgroup_root *root; @@ -1999,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	if (!use_task_css_set_links)  		cgroup_enable_task_cg_lists(); +	if (is_v2) { +		if (data) { +			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); +			return ERR_PTR(-EINVAL); +		} +		cgrp_dfl_root_visible = true; +		root = &cgrp_dfl_root; +		cgroup_get(&root->cgrp); +		goto out_mount; +	} +  	mutex_lock(&cgroup_mutex);  	/* First find the desired set of subsystems */ @@ -2006,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  	if (ret)  		goto out_unlock; -	/* 
look for a matching existing root */ -	if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { -		cgrp_dfl_root_visible = true; -		root = &cgrp_dfl_root; -		cgroup_get(&root->cgrp); -		ret = 0; -		goto out_unlock; -	} -  	/*  	 * Destruction of cgroup root is asynchronous, so subsystems may  	 * still be dying after the previous unmount.  Let's drain the @@ -2125,9 +2114,10 @@ out_free:  	if (ret)  		return ERR_PTR(ret); - +out_mount:  	dentry = kernfs_mount(fs_type, flags, root->kf_root, -				CGROUP_SUPER_MAGIC, &new_sb); +			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, +			      &new_sb);  	if (IS_ERR(dentry) || !new_sb)  		cgroup_put(&root->cgrp); @@ -2170,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {  	.kill_sb = cgroup_kill_sb,  }; +static struct file_system_type cgroup2_fs_type = { +	.name = "cgroup2", +	.mount = cgroup_mount, +	.kill_sb = cgroup_kill_sb, +}; +  /**   * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy   * @task: target task @@ -2216,6 +2212,9 @@ struct cgroup_taskset {  	struct list_head	src_csets;  	struct list_head	dst_csets; +	/* the subsys currently being processed */ +	int			ssid; +  	/*  	 * Fields for cgroup_taskset_*() iteration.  	 * @@ -2278,25 +2277,29 @@ static void cgroup_taskset_add(struct task_struct *task,  /**   * cgroup_taskset_first - reset taskset and return the first task   * @tset: taskset of interest + * @dst_cssp: output variable for the destination css   *   * @tset iteration is initialized and the first task is returned.   */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, +					 struct cgroup_subsys_state **dst_cssp)  {  	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);  	tset->cur_task = NULL; -	return cgroup_taskset_next(tset); +	return cgroup_taskset_next(tset, dst_cssp);  }  /**   * cgroup_taskset_next - iterate to the next task in taskset   * @tset: taskset of interest + * @dst_cssp: output variable for the destination css   *   * Return the next task in @tset.  Iteration must have been initialized   * with cgroup_taskset_first().   */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, +					struct cgroup_subsys_state **dst_cssp)  {  	struct css_set *cset = tset->cur_cset;  	struct task_struct *task = tset->cur_task; @@ -2311,6 +2314,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)  		if (&task->cg_list != &cset->mg_tasks) {  			tset->cur_cset = cset;  			tset->cur_task = task; + +			/* +			 * This function may be called both before and +			 * after cgroup_taskset_migrate().  The two cases +			 * can be distinguished by looking at whether @cset +			 * has its ->mg_dst_cset set. 
+			 */ +			if (cset->mg_dst_cset) +				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; +			else +				*dst_cssp = cset->subsys[tset->ssid]; +  			return task;  		} @@ -2346,7 +2361,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,  	/* check that we can legitimately attach to the cgroup */  	for_each_e_css(css, i, dst_cgrp) {  		if (css->ss->can_attach) { -			ret = css->ss->can_attach(css, tset); +			tset->ssid = i; +			ret = css->ss->can_attach(tset);  			if (ret) {  				failed_css = css;  				goto out_cancel_attach; @@ -2379,9 +2395,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,  	 */  	tset->csets = &tset->dst_csets; -	for_each_e_css(css, i, dst_cgrp) -		if (css->ss->attach) -			css->ss->attach(css, tset); +	for_each_e_css(css, i, dst_cgrp) { +		if (css->ss->attach) { +			tset->ssid = i; +			css->ss->attach(tset); +		} +	}  	ret = 0;  	goto out_release_tset; @@ -2390,8 +2409,10 @@ out_cancel_attach:  	for_each_e_css(css, i, dst_cgrp) {  		if (css == failed_css)  			break; -		if (css->ss->cancel_attach) -			css->ss->cancel_attach(css, tset); +		if (css->ss->cancel_attach) { +			tset->ssid = i; +			css->ss->cancel_attach(tset); +		}  	}  out_release_tset:  	spin_lock_bh(&css_set_lock); @@ -3313,9 +3334,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,  	if (cft->file_offset) {  		struct cgroup_file *cfile = (void *)css + cft->file_offset; -		kernfs_get(kn); +		spin_lock_irq(&cgroup_file_kn_lock);  		cfile->kn = kn; -		list_add(&cfile->node, &css->files); +		spin_unlock_irq(&cgroup_file_kn_lock);  	}  	return 0; @@ -3553,6 +3574,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  }  /** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ +	unsigned long flags; + +	spin_lock_irqsave(&cgroup_file_kn_lock, flags); +	if (cfile->kn) +		kernfs_notify(cfile->kn); +	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/**   * cgroup_task_count - count the number of tasks in a cgroup.   * @cgrp: the cgroup in question   * @@ -4000,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  		goto out_err;  	/* -	 * Migrate tasks one-by-one until @form is empty.  This fails iff +	 * Migrate tasks one-by-one until @from is empty.  This fails iff  	 * ->can_attach() fails.  	 
*/  	do { @@ -4613,13 +4650,9 @@ static void css_free_work_fn(struct work_struct *work)  		container_of(work, struct cgroup_subsys_state, destroy_work);  	struct cgroup_subsys *ss = css->ss;  	struct cgroup *cgrp = css->cgroup; -	struct cgroup_file *cfile;  	percpu_ref_exit(&css->refcnt); -	list_for_each_entry(cfile, &css->files, node) -		kernfs_put(cfile->kn); -  	if (ss) {  		/* css free path */  		int id = css->id; @@ -4724,7 +4757,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,  	css->ss = ss;  	INIT_LIST_HEAD(&css->sibling);  	INIT_LIST_HEAD(&css->children); -	INIT_LIST_HEAD(&css->files);  	css->serial_nr = css_serial_nr_next++;  	if (cgroup_parent(cgrp)) { @@ -4846,11 +4878,11 @@ err_free_css:  static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  			umode_t mode)  { -	struct cgroup *parent, *cgrp; +	struct cgroup *parent, *cgrp, *tcgrp;  	struct cgroup_root *root;  	struct cgroup_subsys *ss;  	struct kernfs_node *kn; -	int ssid, ret; +	int level, ssid, ret;  	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.  	 */ @@ -4861,9 +4893,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  	if (!parent)  		return -ENODEV;  	root = parent->root; +	level = parent->level + 1;  	/* allocate the cgroup and its ID, 0 is reserved for the root */ -	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); +	cgrp = kzalloc(sizeof(*cgrp) + +		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);  	if (!cgrp) {  		ret = -ENOMEM;  		goto out_unlock; @@ -4887,6 +4921,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  	cgrp->self.parent = &parent->self;  	cgrp->root = root; +	cgrp->level = level; + +	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) +		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;  	if (notify_on_release(parent))  		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5131,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)  {  	struct cgroup_subsys_state *css; -	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); +	pr_debug("Initializing cgroup subsys %s\n", ss->name);  	mutex_lock(&cgroup_mutex); @@ -5289,6 +5327,7 @@ int __init cgroup_init(void)  	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));  	WARN_ON(register_filesystem(&cgroup_fs_type)); +	WARN_ON(register_filesystem(&cgroup2_fs_type));  	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));  	return 0; @@ -5432,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {  	.release = single_release,  }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ -	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) -		return &ss_priv[i - CGROUP_CANFORK_START]; -	return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ -	void **private = subsys_canfork_priv_p(ss_priv, i); -	return private ? *private : NULL; -} -  /**   * cgroup_fork - initialize cgroup related fields during copy_process()   * @child: pointer to task_struct of forking parent process. @@ -5467,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)   * returns an error, the fork aborts with that error code. This allows for   * a cgroup subsystem to conditionally allow or deny new forks.   
*/ -int cgroup_can_fork(struct task_struct *child, -		    void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child)  {  	struct cgroup_subsys *ss;  	int i, j, ret;  	for_each_subsys_which(ss, i, &have_canfork_callback) { -		ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); +		ret = ss->can_fork(child);  		if (ret)  			goto out_revert;  	} @@ -5486,7 +5511,7 @@ out_revert:  		if (j >= i)  			break;  		if (ss->cancel_fork) -			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); +			ss->cancel_fork(child);  	}  	return ret; @@ -5499,15 +5524,14 @@ out_revert:   * This calls the cancel_fork() callbacks if a fork failed *after*   * cgroup_can_fork() succeded.   */ -void cgroup_cancel_fork(struct task_struct *child, -			void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child)  {  	struct cgroup_subsys *ss;  	int i;  	for_each_subsys(ss, i)  		if (ss->cancel_fork) -			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); +			ss->cancel_fork(child);  }  /** @@ -5520,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,   * cgroup_task_iter_start() - to guarantee that the new task ends up on its   * list.   */ -void cgroup_post_fork(struct task_struct *child, -		      void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child)  {  	struct cgroup_subsys *ss;  	int i; @@ -5565,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,  	 * and addition to css_set.  	 */  	for_each_subsys_which(ss, i, &have_fork_callback) -		ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); +		ss->fork(child);  }  /** @@ -5765,6 +5788,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)  	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;  } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it.  Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ +	struct kernfs_node *kn; +	struct cgroup *cgrp; + +	mutex_lock(&cgroup_mutex); + +	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); +	if (kn) { +		if (kernfs_type(kn) == KERNFS_DIR) { +			cgrp = kn->priv; +			cgroup_get(cgrp); +		} else { +			cgrp = ERR_PTR(-ENOTDIR); +		} +		kernfs_put(kn); +	} else { +		cgrp = ERR_PTR(-ENOENT); +	} + +	mutex_unlock(&cgroup_mutex); +	return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data + * definition in cgroup-defs.h. 
+ */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ +	if (cgroup_sk_alloc_disabled) +		return; +	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); +	cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled	false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ +	if (cgroup_sk_alloc_disabled) +		return; + +	rcu_read_lock(); + +	while (true) { +		struct css_set *cset; + +		cset = task_css_set(current); +		if (likely(cgroup_tryget(cset->dfl_cgrp))) { +			skcd->val = (unsigned long)cset->dfl_cgrp; +			break; +		} +		cpu_relax(); +	} + +	rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ +	cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif	/* CONFIG_SOCK_CGROUP_DATA */ +  #ifdef CONFIG_CGROUP_DEBUG  static struct cgroup_subsys_state *  debug_css_alloc(struct cgroup_subsys_state *parent_css) | 
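
The ->can_attach(), ->attach() and ->cancel_attach() methods now receive only the taskset, and the destination css for each task is handed back through the new @dst_cssp out-parameter of the iterators, using the ssid stashed in the taskset by cgroup_taskset_migrate(). Below is a minimal sketch of what a controller's ->attach() could look like against this interface; my_ss_attach() and my_apply() are hypothetical names, not part of this patch:

#include <linux/cgroup.h>

/*
 * Hypothetical ->attach() callback against the reworked taskset API: the
 * destination css for each task comes from the iterator via @dst_cssp
 * instead of being passed to the callback as an argument.
 */
static void my_ss_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *dst_css;
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset, &dst_css); task;
	     task = cgroup_taskset_next(tset, &dst_css)) {
		/* my_apply() stands in for whatever per-task state the
		 * controller updates on migration. */
		my_apply(dst_css, task);
	}
}

Because tset->ssid is set before each callback is invoked, the iterators can return the css of the subsystem currently being processed without the callback having to know its own subsystem id.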

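Two of the other additions are meant to be consumed from outside this file. cgroup_file_notify(), together with the new cgroup_file_kn_lock, lets a controller signal an event file without racing against cgroup_rm_file() clearing cgroup_file->kn. A hedged sketch of how a controller might wire this up; struct my_css, my_events_show() and my_signal_event() are illustrative, not part of the patch:

#include <linux/cgroup.h>
#include <linux/seq_file.h>

/* Hypothetical controller state embedding a cgroup_file so the
 * "my.events" file can be poked with cgroup_file_notify(). */
struct my_css {
	struct cgroup_subsys_state	css;
	struct cgroup_file		events_file;
};

static int my_events_show(struct seq_file *sf, void *v)
{
	seq_puts(sf, "events 0\n");	/* placeholder output */
	return 0;
}

static struct cftype my_files[] = {
	{
		.name		= "my.events",
		.file_offset	= offsetof(struct my_css, events_file),
		.seq_show	= my_events_show,
	},
	{ }	/* terminate */
};

static void my_signal_event(struct my_css *mcss)
{
	/*
	 * cgroup_file_notify() takes cgroup_file_kn_lock and skips a NULL
	 * ->kn, which cgroup_rm_file() now clears under the same lock, so
	 * this is safe against concurrent file removal.
	 */
	cgroup_file_notify(&mcss->events_file);
}

cgroup_get_from_path() similarly gives external code a way to resolve a default-hierarchy path to a pinned cgroup. A short usage sketch with a made-up path and hypothetical wrapper; note that cgroup_put() is still called by cgroup_sk_free() above, so the static copy removed from this file is presumably provided elsewhere:

static int my_resolve_group(void)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_path("/my/group");	/* hypothetical v2 path */
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);
	/* ... use the pinned cgroup ... */
	cgroup_put(cgrp);
	return 0;
}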