diff options
Diffstat (limited to 'kernel')
38 files changed, 1964 insertions, 1107 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 9c323a6daa46..ed470aac53da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,13 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ + signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/audit.c b/kernel/audit.c index 6dd556931739..be1c28fd4d57 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1662,7 +1662,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - ktime_get_real_ts64(t); + *t = current_kernel_time64(); *serial = audit_serial(); } } @@ -1833,7 +1833,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 
} /** - * audit_log_hex - convert a buffer to hex and append it to the audit skb + * audit_log_n_hex - convert a buffer to hex and append it to the audit skb * @ab: the audit_buffer * @buf: buffer to convert to hex * @len: length of @buf to be converted diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3260ba2312a9..aac1a41f82bd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1462,7 +1462,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } /** - * audit_free - free a per-task audit context + * __audit_free - free a per-task audit context * @tsk: task whose audit context block to free * * Called from copy_process and do_exit @@ -1489,7 +1489,7 @@ void __audit_free(struct task_struct *tsk) } /** - * audit_syscall_entry - fill in an audit record at syscall entry + * __audit_syscall_entry - fill in an audit record at syscall entry * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 @@ -1536,14 +1536,14 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; context->serial = 0; - ktime_get_real_ts64(&context->ctime); + context->ctime = current_kernel_time64(); context->in_syscall = 1; context->current_state = state; context->ppid = 0; } /** - * audit_syscall_exit - deallocate audit context after a system call + * __audit_syscall_exit - deallocate audit context after a system call * @success: success value of the syscall * @return_code: return value of the syscall * @@ -1705,7 +1705,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, } /** - * audit_reusename - fill out filename with info from existing entry + * __audit_reusename - fill out filename with info from existing entry * @uptr: userland ptr to pathname * * Search the audit_names list for the current audit context. 
If there is an @@ -1730,7 +1730,7 @@ __audit_reusename(const __user char *uptr) } /** - * audit_getname - add a name to the list + * __audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. @@ -2135,7 +2135,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) } /** - * audit_ipc_obj - record audit data for ipc object + * __audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * */ @@ -2151,7 +2151,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) } /** - * audit_ipc_set_perm - record audit data for new ipc permissions + * __audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id @@ -2180,7 +2180,7 @@ void __audit_bprm(struct linux_binprm *bprm) /** - * audit_socketcall - record audit data for sys_socketcall + * __audit_socketcall - record audit data for sys_socketcall * @nargs: number of args, which should not be more than AUDITSC_ARGS. 
* @args: args array * @@ -2211,7 +2211,7 @@ void __audit_fd_pair(int fd1, int fd2) } /** - * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto + * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space * diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 8b4c3c2f2509..5151ff256c29 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -156,6 +156,8 @@ static inline void get_css_set(struct css_set *cset) bool cgroup_ssid_enabled(int ssid); bool cgroup_on_dfl(const struct cgroup *cgrp); +bool cgroup_is_thread_root(struct cgroup *cgrp); +bool cgroup_is_threaded(struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, @@ -173,7 +175,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, struct cgroup_namespace *ns); -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); @@ -183,10 +185,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup); -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, - loff_t off); +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) + __acquires(&cgroup_threadgroup_rwsem); +void cgroup_procs_write_finish(struct task_struct *task) + __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct 
cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7bf4b1533f34..024085daab1a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (cgroup_on_dfl(to)) return -EINVAL; - if (!cgroup_may_migrate_to(to)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(to); + if (ret) + return ret; mutex_lock(&cgroup_mutex); @@ -121,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->self, &it); + css_task_iter_start(&from->self, 0, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -373,7 +374,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -510,10 +511,58 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return 0; } -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool threadgroup) { - return __cgroup_procs_write(of, buf, nbytes, off, false); + struct cgroup *cgrp; + struct task_struct *task; + const struct cred *cred, *tcred; + ssize_t ret; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, threadgroup); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. 
+ */ + cred = current_cred(); + tcred = get_task_cred(task); + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); + if (ret) + goto out_finish; + + ret = cgroup_attach_task(cgrp, task, threadgroup); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup1_procs_write(of, buf, nbytes, off, true); +} + +static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, @@ -592,7 +641,7 @@ struct cftype cgroup1_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, + .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", @@ -611,7 +660,7 @@ struct cftype cgroup1_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, + .write = cgroup1_tasks_write, }, { .name = "notify_on_release", @@ -701,7 +750,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -846,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) + seq_puts(seq, ",cpuset_v2_mode"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -900,6 +951,10 @@ 
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->cpuset_clone_children = true; continue; } + if (!strcmp(token, "cpuset_v2_mode")) { + opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; + continue; + } if (!strcmp(token, "xattr")) { opts->flags |= CGRP_ROOT_XATTR; continue; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index df2e0f14a95d..d6551cd45238 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; +/* some controllers can be threaded on the default hierarchy */ +static u16 cgrp_dfl_threaded_ss_mask; + /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; @@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static struct cgroup *cgroup_parent(struct cgroup *cgrp) +static bool cgroup_has_tasks(struct cgroup *cgrp) { - struct cgroup_subsys_state *parent_css = cgrp->self.parent; + return cgrp->nr_populated_csets; +} - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; +bool cgroup_is_threaded(struct cgroup *cgrp) +{ + return cgrp->dom_cgrp != cgrp; +} + +/* can @cgrp host both domain and threaded children? */ +static bool cgroup_is_mixable(struct cgroup *cgrp) +{ + /* + * Root isn't under domain level resource control exempting it from + * the no-internal-process constraint, so it can serve as a thread + * root and a parent of resource domains at the same time. + */ + return !cgroup_parent(cgrp); +} + +/* can @cgrp become a thread root? 
should always be true for a thread root */ +static bool cgroup_can_be_thread_root(struct cgroup *cgrp) +{ + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return true; + + /* domain roots can't be nested under threaded */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* can only have either domain or threaded children */ + if (cgrp->nr_populated_domain_children) + return false; + + /* and no domain controllers can be enabled */ + if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return false; + + return true; +} + +/* is @cgrp root of a threaded subtree? */ +bool cgroup_is_thread_root(struct cgroup *cgrp) +{ + /* thread root should be a domain */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* a domain w/ threaded children is a thread root */ + if (cgrp->nr_threaded_children) + return true; + + /* + * A domain which has tasks and explicit threaded controllers + * enabled is a thread root. + */ + if (cgroup_has_tasks(cgrp) && + (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) + return true; + + return false; +} + +/* a domain which isn't connected to the root w/o breakage can't be used */ +static bool cgroup_is_valid_domain(struct cgroup *cgrp) +{ + /* the cgroup itself can be a thread root */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* but the ancestors can't be unless mixable */ + while ((cgrp = cgroup_parent(cgrp))) { + if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) + return false; + if (cgroup_is_threaded(cgrp)) + return false; + } + + return true; } /* subsystems visibly enabled on a cgroup */ @@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp) struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent) - return parent->subtree_control; + if (parent) { + u16 ss_mask = parent->subtree_control; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; +
} if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | @@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - if (parent) - return parent->subtree_ss_mask; + if (parent) { + u16 ss_mask = parent->subtree_ss_mask; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } return cgrp->root->subsys_mask; } @@ -436,22 +525,12 @@ out_unlock: return css; } -static void __maybe_unused cgroup_get(struct cgroup *cgrp) -{ - css_get(&cgrp->self); -} - static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); } -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css); */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), + .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), @@ -570,6 +651,11 @@ struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +static bool css_set_threaded(struct css_set *cset) +{ + return cset->dom_cset != cset; +} + /** * css_set_populated - does a css_set contain any tasks? 
* @cset: target css_set @@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - updated populated count of a cgroup + * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->populated_cnt accordingly. The - * count is propagated towards root so that a given cgroup's populated_cnt - * is zero iff the cgroup and all its descendants don't contain any tasks. + * task or losing the last. Update @cgrp->nr_populated_* accordingly. The + * count is propagated towards root so that a given cgroup's + * nr_populated_children is zero iff none of its descendants contain any + * tasks. * - * @cgrp's interface file "cgroup.populated" is zero if - * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt - * changes from or to zero, userland is notified that the content of the - * interface file has changed. This can be used to detect when @cgrp and - * its descendants become populated or empty. + * @cgrp's interface file "cgroup.populated" is zero if both + * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and + * 1 otherwise. When the sum changes from or to zero, userland is notified + * that the content of the interface file has changed. This can be used to + * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { + struct cgroup *child = NULL; + int adj = populated ? 
1 : -1; + lockdep_assert_held(&css_set_lock); do { - bool trigger; + bool was_populated = cgroup_is_populated(cgrp); - if (populated) - trigger = !cgrp->populated_cnt++; - else - trigger = !--cgrp->populated_cnt; + if (!child) { + cgrp->nr_populated_csets += adj; + } else { + if (cgroup_is_threaded(child)) + cgrp->nr_populated_threaded_children += adj; + else + cgrp->nr_populated_domain_children += adj; + } - if (!trigger) + if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); + child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the - * ->populated_cnt of all associated cgroups accordingly. + * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { @@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated) * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * - * This function automatically handles populated_cnt updates and + * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ @@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset) if (!refcount_dec_and_test(&cset->refcount)) return; + WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); + /* This css_set is dead. 
unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } + if (css_set_threaded(cset)) { + list_del(&cset->threaded_csets_node); + put_css_set_locked(cset->dom_cset); + } + kfree_rcu(cset, rcu_head); } @@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { + struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; + + /* @cset's domain should match the default cgroup's */ + if (cgroup_on_dfl(new_cgrp)) + new_dfl_cgrp = new_cgrp; + else + new_dfl_cgrp = old_cset->dfl_cgrp; + + if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) + return false; + /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may @@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, } refcount_set(&cset->refcount, 1); + cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); + INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_preload_node); @@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); + /* + * If @cset should be threaded, look up the matching dom_cset and + * link them up. We first fully initialize @cset then look for the + * dom_cset. It's simpler this way and safe as @cset is guaranteed + * to stay empty until we return. 
+ */ + if (cgroup_is_threaded(cset->dfl_cgrp)) { + struct css_set *dcset; + + dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); + if (!dcset) { + put_css_set(cset); + return NULL; + } + + spin_lock_irq(&css_set_lock); + cset->dom_cset = dcset; + list_add_tail(&cset->threaded_csets_node, + &dcset->threaded_csets); + spin_unlock_irq(&css_set_lock); + } + return cset; } @@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; @@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; + cgrp->dom_cgrp = cgrp; + cgrp->max_descendants = INT_MAX; + cgrp->max_depth = INT_MAX; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -1737,7 +1879,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -2172,17 +2315,40 @@ out_release_tset: } /** - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * - * On the default hierarchy, except for the root, subtree_control must be - * zero for migration destination cgroups with tasks so that child cgroups - * don't compete against tasks. + * On the default hierarchy, except for the mixable, (possible) thread root + * and threaded cgroups, subtree_control must be zero for migration + * destination cgroups with tasks so that child cgroups don't compete + * against tasks. 
*/ -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { - return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || - !dst_cgrp->subtree_control; + /* v1 doesn't have any restriction */ + if (!cgroup_on_dfl(dst_cgrp)) + return 0; + + /* verify @dst_cgrp can host resources */ + if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(dst_cgrp)) + return 0; + + /* + * If @dst_cgrp is already or can become a thread root or is + * threaded, it doesn't matter. + */ + if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) + return 0; + + /* apply no-internal-process constraint */ + if (dst_cgrp->subtree_control) + return -EBUSY; + + return 0; } /** @@ -2387,8 +2553,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, struct task_struct *task; int ret; - if (!cgroup_may_migrate_to(dst_cgrp)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -2415,96 +2582,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -static int cgroup_procs_write_permission(struct task_struct *task, - struct cgroup *dst_cgrp, - struct kernfs_open_file *of) -{ - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; - struct cgroup *src_cgrp, *com_cgrp; - struct inode *inode; - int ret; - - if (!cgroup_on_dfl(dst_cgrp)) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); - - /* - * even if we're attaching all tasks in the thread group, - * we only need to check permissions on one of them. 
- */ - if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->euid, tcred->suid)) - ret = 0; - else - ret = -EACCES; - - put_cred(tcred); - return ret; - } - - /* find the source cgroup */ - spin_lock_irq(&css_set_lock); - src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - /* and the common ancestor */ - com_cgrp = src_cgrp; - while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) - com_cgrp = cgroup_parent(com_cgrp); - - /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - if (ret) - return ret; - - /* - * If namespaces are delegation boundaries, %current must be able - * to see both source and destination cgroups from its namespace. - */ - if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && - (!cgroup_is_descendant(src_cgrp, root_cgrp) || - !cgroup_is_descendant(dst_cgrp, root_cgrp))) - return -ENOENT; - - return 0; -} - -/* - * Find the task_struct of the task to attach by vpid and pass it along to the - * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup. 
- */ -ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) + __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; - struct cgroup_subsys *ss; - struct cgroup *cgrp; pid_t pid; - int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) - return -EINVAL; - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; + return ERR_PTR(-EINVAL); percpu_down_write(&cgroup_threadgroup_rwsem); + rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { - ret = -ESRCH; - goto out_unlock_rcu; + tsk = ERR_PTR(-ESRCH); + goto out_unlock_threadgroup; } } else { tsk = current; @@ -2520,35 +2614,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { - ret = -EINVAL; - goto out_unlock_rcu; + tsk = ERR_PTR(-EINVAL); + goto out_unlock_threadgroup; } get_task_struct(tsk); + goto out_unlock_rcu; + +out_unlock_threadgroup: + percpu_up_write(&cgroup_threadgroup_rwsem); +out_unlock_rcu: rcu_read_unlock(); + return tsk; +} - ret = cgroup_procs_write_permission(tsk, cgrp, of); - if (!ret) - ret = cgroup_attach_task(cgrp, tsk, threadgroup); +void cgroup_procs_write_finish(struct task_struct *task) + __releases(&cgroup_threadgroup_rwsem) +{ + struct cgroup_subsys *ss; + int ssid; - put_task_struct(tsk); - goto out_unlock_threadgroup; + /* release reference from cgroup_procs_write_start() */ + put_task_struct(task); -out_unlock_rcu: - rcu_read_unlock(); -out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); - cgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, - loff_t off) -{ - return 
__cgroup_procs_write(of, buf, nbytes, off, true); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -2891,6 +2983,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +{ + u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + + /* if nothing is getting enabled, nothing to worry about */ + if (!enable) + return 0; + + /* can @cgrp host any resources? */ + if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return 0; + + if (domain_enable) { + /* can't enable domain controllers inside a thread subtree */ + if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return -EOPNOTSUPP; + } else { + /* + * Threaded controllers can handle internal competitions + * and are always allowed inside a (prospective) thread + * subtree. + */ + if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return 0; + } + + /* + * Controllers can't be enabled for a cgroup with tasks to avoid + * child cgroups competing against tasks. + */ + if (cgroup_has_tasks(cgrp)) + return -EBUSY; + + return 0; +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -2966,33 +3098,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (enable && cgroup_parent(cgrp)) { - struct cgrp_cset_link *link; - - /* - * Because namespaces pin csets too, @cgrp->cset_links - * might not be empty even when @cgrp is empty. Walk and - * verify each cset. 
- */ - spin_lock_irq(&css_set_lock); - - ret = 0; - list_for_each_entry(link, &cgrp->cset_links, cset_link) { - if (css_set_populated(link->cset)) { - ret = -EBUSY; - break; - } - } - - spin_unlock_irq(&css_set_lock); - - if (ret) - goto out_unlock; - } + ret = cgroup_vet_subtree_control_enable(cgrp, enable); + if (ret) + goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -3011,6 +3119,172 @@ out_unlock: return ret ?: nbytes; } +/** + * cgroup_enable_threaded - make @cgrp threaded + * @cgrp: the target cgroup + * + * Called when "threaded" is written to the cgroup.type interface file and + * tries to make @cgrp threaded and join the parent's resource domain. + * This function is never called on the root cgroup as cgroup.type doesn't + * exist on it. + */ +static int cgroup_enable_threaded(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup *dom_cgrp = parent->dom_cgrp; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* noop if already threaded */ + if (cgroup_is_threaded(cgrp)) + return 0; + + /* we're joining the parent's domain, ensure its validity */ + if (!cgroup_is_valid_domain(dom_cgrp) || + !cgroup_can_be_thread_root(dom_cgrp)) + return -EOPNOTSUPP; + + /* + * The following shouldn't cause actual migrations and should + * always succeed. 
+ */ + cgroup_save_control(cgrp); + + cgrp->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); + if (!ret) + parent->nr_threaded_children++; + else + cgrp->dom_cgrp = cgrp; + + cgroup_finalize_control(cgrp, ret); + return ret; +} + +static int cgroup_type_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + if (cgroup_is_threaded(cgrp)) + seq_puts(seq, "threaded\n"); + else if (!cgroup_is_valid_domain(cgrp)) + seq_puts(seq, "domain invalid\n"); + else if (cgroup_is_thread_root(cgrp)) + seq_puts(seq, "domain threaded\n"); + else + seq_puts(seq, "domain\n"); + + return 0; +} + +static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int ret; + + /* only switching to threaded mode is supported */ + if (strcmp(strstrip(buf), "threaded")) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* threaded can only be enabled */ + ret = cgroup_enable_threaded(cgrp); + + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static int cgroup_max_descendants_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int descendants = READ_ONCE(cgrp->max_descendants); + + if (descendants == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", descendants); + + return 0; +} + +static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int descendants; + ssize_t ret; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + descendants = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &descendants); + if (ret) + return ret; + } + + if (descendants < 0) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_descendants = descendants; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + +static int cgroup_max_depth_show(struct seq_file *seq, 
void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int depth = READ_ONCE(cgrp->max_depth); + + if (depth == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", depth); + + return 0; +} + +static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int depth; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + depth = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &depth); + if (ret) + return ret; + } + + if (depth < 0) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_depth = depth; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", @@ -3018,6 +3292,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgroup = seq_css(seq)->cgroup; + + seq_printf(seq, "nr_descendants %d\n", + cgroup->nr_descendants); + seq_printf(seq, "nr_dying_descendants %d\n", + cgroup->nr_dying_descendants); + + return 0; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -3234,7 +3520,6 @@ restart: static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { - LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; @@ -3659,6 +3944,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) +{ + struct list_head *l; + struct cgrp_cset_link *link; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* find the next threaded cset */ + if (it->tcset_pos) { + l = it->tcset_pos->next; + + if (l != it->tcset_head) { + it->tcset_pos = l; + return container_of(l, struct css_set, + 
threaded_csets_node); + } + + it->tcset_pos = NULL; + } + + /* find the next cset */ + l = it->cset_pos; + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; + return NULL; + } + + if (it->ss) { + cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + + it->cset_pos = l; + + /* initialize threaded css_set walking */ + if (it->flags & CSS_TASK_ITER_THREADED) { + if (it->cur_dcset) + put_css_set_locked(it->cur_dcset); + it->cur_dcset = cset; + get_css_set(cset); + + it->tcset_head = &cset->threaded_csets; + it->tcset_pos = &cset->threaded_csets; + } + + return cset; +} + /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance @@ -3667,32 +4004,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { - struct list_head *l = it->cset_pos; - struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; + cset = css_task_iter_next_css_set(it); + if (!cset) { it->task_pos = NULL; return; } - - if (it->ss) { - cset = container_of(l, struct css_set, - e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } } while (!css_set_populated(cset)); - it->cset_pos = l; - if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; else @@ -3732,6 +4056,7 @@ static void css_task_iter_advance(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); WARN_ON_ONCE(!l); +repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. 
After ->mg_tasks, we move onto the @@ -3746,11 +4071,18 @@ static void css_task_iter_advance(struct css_task_iter *it) css_task_iter_advance_css_set(it); else it->task_pos = l; + + /* if PROCS, skip over tasks which aren't group leaders */ + if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && + !thread_group_leader(list_entry(it->task_pos, struct task_struct, + cg_list))) + goto repeat; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of + * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -3758,7 +4090,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -3769,6 +4101,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, spin_lock_irq(&css_set_lock); it->ss = css->ss; + it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -3826,6 +4159,9 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } + if (it->cur_dcset) + put_css_set(it->cur_dcset); + if (it->cur_task) put_task_struct(it->cur_task); } @@ -3842,16 +4178,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; - struct task_struct *task; - - do { - task = css_task_iter_next(it); - } while (task && !thread_group_leader(task)); - return task; + return css_task_iter_next(it); } -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, + unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup 
*cgrp = seq_css(s)->cgroup; @@ -3869,24 +4201,169 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } return cgroup_procs_next(s, NULL, NULL); } +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +{ + struct cgroup *cgrp = seq_css(s)->cgroup; + + /* + * All processes of a threaded subtree belong to the domain cgroup + * of the subtree. Only threads can be distributed across the + * subtree. Reject reads on cgroup.procs in the subtree proper. + * They're always empty anyway. + */ + if (cgroup_is_threaded(cgrp)) + return ERR_PTR(-EOPNOTSUPP); + + return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | + CSS_TASK_ITER_THREADED); +} + static int cgroup_procs_show(struct seq_file *s, void *v) { - seq_printf(s, "%d\n", task_tgid_vnr(v)); + seq_printf(s, "%d\n", task_pid_vnr(v)); + return 0; +} + +static int cgroup_procs_write_permission(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *com_cgrp = src_cgrp; + struct inode *inode; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* find the common ancestor */ + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. 
+ */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || + !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) + return -ENOENT; + return 0; } +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, true); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, true); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) +{ + return __cgroup_procs_start(s, pos, 0); +} + +static ssize_t cgroup_threads_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + buf = strstrip(buf); + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, false); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* thread migrations follow the cgroup.procs delegation rule */ + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + /* and must be contained in the same domain */ + ret = 
-EOPNOTSUPP; + if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, false); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { + .name = "cgroup.type", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_type_show, + .write = cgroup_type_write, + }, + { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), @@ -3897,6 +4374,14 @@ static struct cftype cgroup_base_files[] = { .write = cgroup_procs_write, }, { + .name = "cgroup.threads", + .release = cgroup_procs_release, + .seq_start = cgroup_threads_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, + .write = cgroup_threads_write, + }, + { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, @@ -3912,6 +4397,20 @@ static struct cftype cgroup_base_files[] = { .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, + { + .name = "cgroup.max.descendants", + .seq_show = cgroup_max_descendants_show, + .write = cgroup_max_descendants_write, + }, + { + .name = "cgroup.max.depth", + .seq_show = cgroup_max_depth_show, + .write = cgroup_max_depth_write, + }, + { + .name = "cgroup.stat", + .seq_show = cgroup_stat_show, + }, { } /* terminate */ }; @@ -4011,9 +4510,15 @@ static void css_release_work_fn(struct work_struct *work) if (ss->css_released) ss->css_released(css); } else { + struct cgroup *tcgrp; + /* cgroup release path */ trace_cgroup_release(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; + tcgrp = cgroup_parent(tcgrp)) + tcgrp->nr_dying_descendants--; + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4100,9 +4605,6 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; - if (ss->css_reset) - 
ss->css_reset(css); - if (ss->css_offline) ss->css_offline(css); @@ -4212,9 +4714,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->root = root; cgrp->level = level; - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + if (tcgrp != cgrp) + tcgrp->nr_descendants++; + } + if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4255,6 +4761,29 @@ out_free_cgrp: return ERR_PTR(ret); } +static bool cgroup_check_hierarchy_limits(struct cgroup *parent) +{ + struct cgroup *cgroup; + int ret = false; + int level = 1; + + lockdep_assert_held(&cgroup_mutex); + + for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { + if (cgroup->nr_descendants >= cgroup->max_descendants) + goto fail; + + if (level > cgroup->max_depth) + goto fail; + + level++; + } + + ret = true; +fail: + return ret; +} + int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; @@ -4269,6 +4798,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (!parent) return -ENODEV; + if (!cgroup_check_hierarchy_limits(parent)) { + ret = -EAGAIN; + goto out_unlock; + } + cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); @@ -4420,6 +4954,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4464,7 +4999,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - cgroup1_check_for_release(cgroup_parent(cgrp)); + if (parent && cgroup_is_threaded(cgrp)) + parent->nr_threaded_children--; + + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + 
tcgrp->nr_descendants--; + tcgrp->nr_dying_descendants++; + } + + cgroup1_check_for_release(parent); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -4659,11 +5202,17 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; + /* implicit controllers must be threaded too */ + WARN_ON(ss->implicit_on_dfl && !ss->threaded); + if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; + if (ss->threaded) + cgrp_dfl_threaded_ss_mask |= 1 << ss->id; + if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { @@ -4708,6 +5257,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039bafebb..67230ecf2ce1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -56,6 +56,7 @@ #include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> +#include <linux/oom.h> #include <linux/uaccess.h> #include <linux/atomic.h> @@ -300,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* + * Cgroup v2 behavior is used when on default hierarchy or the + * cgroup_v2_mode flag is set. + */ +static inline bool is_in_v2_mode(void) +{ + return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); +} + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. 
If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -489,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; /* @@ -869,7 +879,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); @@ -903,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - cpumask_empty(new_cpus)) + if (is_in_v2_mode() && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -921,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1099,7 +1108,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. 
*/ - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1157,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - nodes_empty(*new_mems)) + if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1175,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1292,7 +1300,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); @@ -1467,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + if (!is_in_v2_mode() && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1986,7 +1994,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2063,7 +2071,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); 
spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2257,7 +2265,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2288,7 +2296,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool on_dfl = is_in_v2_mode(); mutex_lock(&cpuset_mutex); @@ -2500,12 +2508,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, - * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. + * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. + * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. 
* @@ -2528,7 +2536,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok + * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -2546,7 +2554,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) + if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index dac46af22782..f661b4cc5efd 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -114,27 +114,49 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) { struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - int dead_cnt = 0, extra_refs = 0; + int dead_cnt = 0, extra_refs = 0, threaded_csets = 0; spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; int refcnt = refcount_read(&cset->refcount); - seq_printf(seq, " %d", refcnt); - if (refcnt - cset->nr_tasks > 0) { - int extra = refcnt - cset->nr_tasks; - - seq_printf(seq, " +%d", extra); - /* - * Take out the one additional reference in - * init_css_set. - */ - if (cset == &init_css_set) - extra--; - extra_refs += extra; + /* + * Print out the proc_cset and threaded_cset relationship + * and highlight difference between refcount and task_count. 
+ */ + seq_printf(seq, "css_set %pK", cset); + if (rcu_dereference_protected(cset->dom_cset, 1) != cset) { + threaded_csets++; + seq_printf(seq, "=>%pK", cset->dom_cset); + } + if (!list_empty(&cset->threaded_csets)) { + struct css_set *tcset; + int idx = 0; + + list_for_each_entry(tcset, &cset->threaded_csets, + threaded_csets_node) { + seq_puts(seq, idx ? "," : "<="); + seq_printf(seq, "%pK", tcset); + idx++; + } + } else { + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. + */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } } seq_puts(seq, "\n"); @@ -163,10 +185,12 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) } spin_unlock_irq(&css_set_lock); - if (!dead_cnt && !extra_refs) + if (!dead_cnt && !extra_refs && !threaded_csets) return 0; seq_puts(seq, "\n"); + if (threaded_csets) + seq_printf(seq, "threaded css_sets = %d\n", threaded_csets); if (extra_refs) seq_printf(seq, "extra references = %d\n", extra_refs); if (dead_cnt) @@ -352,6 +376,7 @@ static int __init enable_cgroup_debug(char *str) { debug_cgrp_subsys.dfl_cftypes = debug_files; debug_cgrp_subsys.implicit_on_dfl = true; + debug_cgrp_subsys.threaded = true; return 1; } __setup("cgroup_debug", enable_cgroup_debug); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..08236798d173 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? 
*/ - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 2237201d66d5..9829c67ebc0a 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = { .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, + .threaded = true, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index fb415e3d824b..3e691b75b2db 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11293,5 +11293,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = { * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/fork.c b/kernel/fork.c index 4e5345c07344..6f1b0af00bda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> +#include <linux/hmm.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/vmacache.h> @@ -657,7 +658,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; - if (anon_vma_fork(tmp, mpnt)) + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. 
*/ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; @@ -701,7 +707,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -818,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -897,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); @@ -922,7 +931,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -938,22 +946,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU -static void mmput_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmput(mm); -} - -void mmput_async(struct mm_struct *mm) -{ - if (atomic_dec_and_test(&mm->mm_users)) { - INIT_WORK(&mm->async_put_work, mmput_async_fn); - schedule_work(&mm->async_put_work); - } -} -#endif - /** * set_mm_exe_file - change a reference to the mm's executable file * @@ -1470,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = 
RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif diff --git a/kernel/kcov.c b/kernel/kcov.c index cd771993f96f..3f693a0f6f3e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) static const struct file_operations kcov_fops = { .open = kcov_open, .unlocked_ioctl = kcov_ioctl, + .compat_ioctl = kcov_ioctl, .mmap = kcov_mmap, .release = kcov_close, }; diff --git a/kernel/kmod.c b/kernel/kmod.c index 2f37acde640b..bc6addd9152b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -1,23 +1,6 @@ /* - kmod, the new module loader (replaces kerneld) - Kirk Petersen - - Reorganized not to be a daemon by Adam Richter, with guidance - from Greg Zornetzer. - - Modified to avoid chroot and file sharing problems. - Mikael Pettersson - - Limit the concurrent number of kmod modprobes to catch loops from - "modprobe needs a service that is in a module". - Keith Owens <kaos@ocs.com.au> December 1999 - - Unblock all signals when we exec a usermode process. - Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000 - - call_usermodehelper wait flag, and remove exec_usermodehelper. - Rusty Russell <rusty@rustcorp.com.au> Jan 2003 -*/ + * kmod - the kernel module loader + */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -45,15 +28,6 @@ #include <trace/events/module.h> -#define CAP_BSET (void *)1 -#define CAP_PI (void *)2 - -static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; -static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; -static DEFINE_SPINLOCK(umh_sysctl_lock); -static DECLARE_RWSEM(umhelper_sem); - -#ifdef CONFIG_MODULES /* * Assuming: * @@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...) 
return ret; } EXPORT_SYMBOL(__request_module); - -#endif /* CONFIG_MODULES */ - -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away - * or the caller used UMH_NO_WAIT. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - -/* - * This is the task which runs the usermode application - */ -static int call_usermodehelper_exec_async(void *data) -{ - struct subprocess_info *sub_info = data; - struct cred *new; - int retval; - - spin_lock_irq(&current->sighand->siglock); - flush_signal_handlers(current, 1); - spin_unlock_irq(&current->sighand->siglock); - - /* - * Our parent (unbound workqueue) runs with elevated scheduling - * priority. Avoid propagating that into the userspace child. - */ - set_user_nice(current, 0); - - retval = -ENOMEM; - new = prepare_kernel_cred(current); - if (!new) - goto out; - - spin_lock(&umh_sysctl_lock); - new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); - new->cap_inheritable = cap_intersect(usermodehelper_inheritable, - new->cap_inheritable); - spin_unlock(&umh_sysctl_lock); - - if (sub_info->init) { - retval = sub_info->init(sub_info, new); - if (retval) { - abort_creds(new); - goto out; - } - } - - commit_creds(new); - - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); -out: - sub_info->retval = retval; - /* - * call_usermodehelper_exec_sync() will call umh_complete - * if UHM_WAIT_PROC. - */ - if (!(sub_info->wait & UMH_WAIT_PROC)) - umh_complete(sub_info); - if (!retval) - return 0; - do_exit(0); -} - -/* Handles UMH_WAIT_PROC.
*/ -static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) -{ - pid_t pid; - - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } - - /* Restore default kernel sig handler */ - kernel_sigaction(SIGCHLD, SIG_IGN); - - umh_complete(sub_info); -} - -/* - * We need to create the usermodehelper kernel thread from a task that is affine - * to an optimized set of CPUs (or nohz housekeeping ones) such that they - * inherit a widest affinity irrespective of call_usermodehelper() callers with - * possibly reduced affinity (eg: per-cpu workqueues). We don't want - * usermodehelper targets to contend a busy CPU. - * - * Unbound workqueues provide such wide affinity and allow to block on - * UMH_WAIT_PROC requests without blocking pending request (up to some limit). - * - * Besides, workqueues provide the privilege level that caller might not have - * to perform the usermodehelper request. 
- * - */ -static void call_usermodehelper_exec_work(struct work_struct *work) -{ - struct subprocess_info *sub_info = - container_of(work, struct subprocess_info, work); - - if (sub_info->wait & UMH_WAIT_PROC) { - call_usermodehelper_exec_sync(sub_info); - } else { - pid_t pid; - /* - * Use CLONE_PARENT to reparent it to kthreadd; we do not - * want to pollute current->children, and we need a parent - * that always ignores SIGCHLD to ensure auto-reaping. - */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); - } - } -} - -/* - * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY - * (used for preventing user land processes from being created after the user - * land has been frozen during a system-wide hibernation or suspend operation). - * Should always be manipulated under umhelper_sem acquired for write. - */ -static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; - -/* Number of helpers running */ -static atomic_t running_helpers = ATOMIC_INIT(0); - -/* - * Wait queue head used by usermodehelper_disable() to wait for all running - * helpers to finish. - */ -static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); - -/* - * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled - * to become 'false'. 
- */ -static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); - -/* - * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_disable() fails - */ -#define RUNNING_HELPERS_TIMEOUT (5 * HZ) - -int usermodehelper_read_trylock(void) -{ - DEFINE_WAIT(wait); - int ret = 0; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_INTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - if (usermodehelper_disabled == UMH_DISABLED) - ret = -EAGAIN; - - up_read(&umhelper_sem); - - if (ret) - break; - - schedule(); - try_to_freeze(); - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return ret; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); - -long usermodehelper_read_lock_wait(long timeout) -{ - DEFINE_WAIT(wait); - - if (timeout < 0) - return -EINVAL; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_UNINTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - up_read(&umhelper_sem); - - timeout = schedule_timeout(timeout); - if (!timeout) - break; - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return timeout; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); - -void usermodehelper_read_unlock(void) -{ - up_read(&umhelper_sem); -} -EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); - -/** - * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * @depth: New value to assign to usermodehelper_disabled. - * - * Change the value of usermodehelper_disabled (under umhelper_sem locked for - * writing) and wakeup tasks waiting for it to change. 
- */ -void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - wake_up(&usermodehelper_disabled_waitq); - up_write(&umhelper_sem); -} - -/** - * __usermodehelper_disable - Prevent new helpers from being started. - * @depth: New value to assign to usermodehelper_disabled. - * - * Set usermodehelper_disabled to @depth and wait for running helpers to exit. - */ -int __usermodehelper_disable(enum umh_disable_depth depth) -{ - long retval; - - if (!depth) - return -EINVAL; - - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - up_write(&umhelper_sem); - - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, - atomic_read(&running_helpers) == 0, - RUNNING_HELPERS_TIMEOUT); - if (retval) - return 0; - - __usermodehelper_set_disable_depth(UMH_ENABLED); - return -EAGAIN; -} - -static void helper_lock(void) -{ - atomic_inc(&running_helpers); - smp_mb__after_atomic(); -} - -static void helper_unlock(void) -{ - if (atomic_dec_and_test(&running_helpers)) - wake_up(&running_helpers_waitq); -} - -/** - * call_usermodehelper_setup - prepare to call a usermode helper - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * Returns either %NULL on allocation failure, or a subprocess_info - * structure. This should be passed to call_usermodehelper_exec to - * exec the process and free the structure. - * - * The init function is used to customize the helper process prior to - * exec. 
A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, - char **envp, gfp_t gfp_mask, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); - if (!sub_info) - goto out; - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - -#ifdef CONFIG_STATIC_USERMODEHELPER - sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; -#else - sub_info->path = path; -#endif - sub_info->argv = argv; - sub_info->envp = envp; - - sub_info->cleanup = cleanup; - sub_info->init = init; - sub_info->data = data; - out: - return sub_info; -} -EXPORT_SYMBOL(call_usermodehelper_setup); - -/** - * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of system workqueues. - * (ie. it runs with full root capabilities and optimized affinity). 
- */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) -{ - DECLARE_COMPLETION_ONSTACK(done); - int retval = 0; - - if (!sub_info->path) { - call_usermodehelper_freeinfo(sub_info); - return -EINVAL; - } - helper_lock(); - if (usermodehelper_disabled) { - retval = -EBUSY; - goto out; - } - - /* - * If there is no binary for us to call, then just return and get out of - * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and - * disable all call_usermodehelper() calls. - */ - if (strlen(sub_info->path) == 0) - goto out; - - /* - * Set the completion pointer only if there is a waiter. - * This makes it possible to use umh_complete to free - * the data structure in case of UMH_NO_WAIT. - */ - sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; - sub_info->wait = wait; - - queue_work(system_unbound_wq, &sub_info->work); - if (wait == UMH_NO_WAIT) /* task has freed sub_info */ - goto unlock; - - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; - - /* umh_complete() will see NULL and free sub_info */ - if (xchg(&sub_info->complete, NULL)) - goto unlock; - /* fallthrough, umh_complete() was already called */ - } - - wait_for_completion(&done); -wait_done: - retval = sub_info->retval; -out: - call_usermodehelper_freeinfo(sub_info); -unlock: - helper_unlock(); - return retval; -} -EXPORT_SYMBOL(call_usermodehelper_exec); - -/** - * call_usermodehelper() - prepare and start a usermode application - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * This function is the equivalent to use call_usermodehelper_setup() and - * call_usermodehelper_exec(). 
- */ -int call_usermodehelper(const char *path, char **argv, char **envp, int wait) -{ - struct subprocess_info *info; - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - - info = call_usermodehelper_setup(path, argv, envp, gfp_mask, - NULL, NULL, NULL); - if (info == NULL) - return -ENOMEM; - - return call_usermodehelper_exec(info, wait); -} -EXPORT_SYMBOL(call_usermodehelper); - -static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; - kernel_cap_t new_cap; - int err, i; - - if (write && (!capable(CAP_SETPCAP) || - !capable(CAP_SYS_MODULE))) - return -EPERM; - - /* - * convert from the global kernel_cap_t to the ulong array to print to - * userspace if this is a read. - */ - spin_lock(&umh_sysctl_lock); - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { - if (table->data == CAP_BSET) - cap_array[i] = usermodehelper_bset.cap[i]; - else if (table->data == CAP_PI) - cap_array[i] = usermodehelper_inheritable.cap[i]; - else - BUG(); - } - spin_unlock(&umh_sysctl_lock); - - t = *table; - t.data = &cap_array; - - /* - * actually read or write and array of ulongs from userspace. 
Remember - * these are least significant 32 bits first - */ - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - /* - * convert from the sysctl array of ulongs to the kernel_cap_t - * internal representation - */ - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) - new_cap.cap[i] = cap_array[i]; - - /* - * Drop everything not in the new_cap (but don't add things) - */ - spin_lock(&umh_sysctl_lock); - if (write) { - if (table->data == CAP_BSET) - usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); - if (table->data == CAP_PI) - usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); - } - spin_unlock(&umh_sysctl_lock); - - return 0; -} - -struct ctl_table usermodehelper_table[] = { - { - .procname = "bset", - .data = CAP_BSET, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { - .procname = "inheritable", - .data = CAP_PI, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { } -}; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct 
rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) if 
(RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else diff --git a/kernel/memremap.c b/kernel/memremap.c index 9afdc434fb49..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,13 +11,14 @@ * General Public 
License for more details. */ #include <linux/radix-tree.h> -#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> +#include <linux/swap.h> +#include <linux/swapops.h> #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -194,18 +195,69 @@ struct page_map { struct vmem_altmap altmap; }; -static void pgmap_radix_release(struct resource *res) +static unsigned long order_at(struct resource *res, unsigned long pgoff) { - resource_size_t key, align_start, align_size, align_end; + unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; + unsigned long nr_pages, mask; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); - align_end = align_start + align_size - 1; + nr_pages = PHYS_PFN(resource_size(res)); + if (nr_pages == pgoff) + return ULONG_MAX; + + /* + * What is the largest aligned power-of-2 range available from + * this resource pgoff to the end of the resource range, + * considering the alignment of the current pgoff? + */ + mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); + if (!mask) + return ULONG_MAX; + + return find_first_bit(&mask, BITS_PER_LONG); +} + +#define foreach_order_pgoff(res, order, pgoff) \ + for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ + pgoff += 1UL << order, order = order_at((res), pgoff)) + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + struct page *page = device_private_entry_to_page(entry); + + /* + * The page_fault() callback must migrate page back to system memory + * so that CPU can access it. This might fail for various reasons + * (device issue, device was unsafely unplugged, ...). 
When such + * error conditions happen, the callback must return VM_FAULT_SIGBUS. + * + * Note that because memory cgroup charges are accounted to the device + * memory, this should never fail because of memory restrictions (but + * allocation of regular system page might still fail because we are + * out of memory). + * + * There is a more in-depth description of what that callback can and + * cannot do, in include/linux/memremap.h + */ + return page->pgmap->page_fault(vma, addr, page, flags, pmdp); +} +EXPORT_SYMBOL(device_private_entry_fault); +#endif /* CONFIG_DEVICE_PRIVATE */ + +static void pgmap_radix_release(struct resource *res) +{ + unsigned long pgoff, order; mutex_lock(&pgmap_lock); - for (key = res->start; key <= res->end; key += SECTION_SIZE) - radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + foreach_order_pgoff(res, order, pgoff) + radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); mutex_unlock(&pgmap_lock); + + synchronize_rcu(); } static unsigned long pfn_first(struct page_map *page_map) @@ -268,7 +320,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) WARN_ON_ONCE(!rcu_read_lock_held()); - page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); return page_map ? 
&page_map->pgmap : NULL; } @@ -293,12 +345,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t align_start, align_size, align_end; + unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; - unsigned long pfn; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -333,15 +385,20 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, } pgmap->ref = ref; pgmap->res = &page_map->res; + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_fault = NULL; + pgmap->page_free = NULL; + pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; align_end = align_start + align_size - 1; - for (key = align_start; key <= align_end; key += SECTION_SIZE) { + + foreach_order_pgoff(res, order, pgoff) { struct dev_pagemap *dup; rcu_read_lock(); - dup = find_dev_pagemap(key); + dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); rcu_read_unlock(); if (dup) { dev_err(dev, "%s: %pr collides with mapping for %s\n", @@ -349,8 +406,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, error = -EBUSY; break; } - error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, - page_map); + error = __radix_tree_insert(&pgmap_radix, + PHYS_PFN(res->start) + pgoff, order, page_map); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; @@ -442,3 +499,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? 
pgmap->altmap : NULL; } #endif /* CONFIG_ZONE_DEVICE */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) +{ + int count = page_ref_dec_return(page); + + /* + * If refcount is 1 then page is freed and refcount is stable as nobody + * holds a reference on the page. + */ + if (count == 1) { + /* Clear Active bit in case of parallel mark_page_accessed */ + __ClearPageActive(page); + __ClearPageWaiters(page); + + page->mapping = NULL; + mem_cgroup_uncharge(page); + + page->pgmap->page_free(page, page->pgmap->data); + } else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 57d22571f306..d7cdc426ee38 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio) if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } @@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = hib_resume_bdev; + bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc47863f629c..512f7c2baedd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -649,7 +649,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, int source) +static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've 
already opened it, then we've @@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } -EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { @@ -1435,7 +1434,7 @@ int do_syslog(int type, char __user *buf, int len, int source) error = check_syslog_permissions(type, source); if (error) - goto out; + return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ @@ -1443,20 +1442,16 @@ int do_syslog(int type, char __user *buf, int len, int source) case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ - error = -EINVAL; if (!buf || len < 0) - goto out; - error = 0; + return -EINVAL; if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } + return 0; + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); if (error) - goto out; + return error; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ @@ -1465,16 +1460,12 @@ int do_syslog(int type, char __user *buf, int len, int source) /* FALL THRU */ /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: - error = -EINVAL; if (!buf || len < 0) - goto out; - error = 0; + return -EINVAL; if (!len) - goto out; - if (!access_ok(VERIFY_WRITE, buf, len)) { - error = -EFAULT; - goto out; - } + return 0; + if (!access_ok(VERIFY_WRITE, buf, len)) + return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ @@ -1496,15 +1487,13 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: - error = -EINVAL; if (len < 1 || len > 8) - goto out; + return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = 
LOGLEVEL_DEFAULT; - error = 0; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: @@ -1526,7 +1515,6 @@ int do_syslog(int type, char __user *buf, int len, int source) u64 seq = syslog_seq; u32 idx = syslog_idx; - error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); @@ -1546,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source) error = -EINVAL; break; } -out: + return error; } @@ -1698,10 +1686,10 @@ asmlinkage int vprintk_emit(int facility, int level, { static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len = 0; + size_t text_len; enum log_flags lflags = 0; unsigned long flags; - int printed_len = 0; + int printed_len; bool in_sched = false; if (level == LOGLEVEL_SCHED) { @@ -1754,7 +1742,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); + printed_len = log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_unlock_irqrestore(flags); @@ -2650,9 +2638,8 @@ void __init console_init(void) * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory - * intersects with the init section. Note that code exists elsewhere to get - * rid of the boot console as soon as the proper console shows up, so there - * won't be side-effects from postponing the removal. + * intersects with the init section. Note that all other boot consoles will + * get unregistred when the real preferred console is registered. */ static int __init printk_late_init(void) { @@ -2660,16 +2647,23 @@ static int __init printk_late_init(void) int ret; for_each_console(con) { - if (!keep_bootcon && con->flags & CON_BOOT) { + if (!(con->flags & CON_BOOT)) + continue; + + /* Check addresses that might be used for enabled consoles. 
*/ + if (init_section_intersects(con, sizeof(*con)) || + init_section_contains(con->write, 0) || + init_section_contains(con->read, 0) || + init_section_contains(con->device, 0) || + init_section_contains(con->unblank, 0) || + init_section_contains(con->data, 0)) { /* - * Make sure to unregister boot consoles whose data - * resides in the init section before the init section - * is discarded. Boot consoles whose data will stick - * around will automatically be unregistered when the - * proper console replaces them. + * Please, consider moving the reported consoles out + * of the init section. */ - if (init_section_intersects(con, sizeof(*con))) - unregister_console(con); + pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", + con->name, con->index); + unregister_console(con); } } ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority 
boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9e38df7649f4..0191ec7667c3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->rb_leftmost == &dl_se->rb_node; + return dl_rq->root.rb_leftmost == &dl_se->rb_node; } void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) @@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b) void init_dl_rq(struct dl_rq *dl_rq) { - dl_rq->rb_root = RB_ROOT; + dl_rq->root = RB_ROOT_CACHED; #ifdef CONFIG_SMP /* zero means no -deadline tasks */ @@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq) dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; - dl_rq->pushable_dl_tasks_root = RB_ROOT; + dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else init_dl_bw(&dl_rq->dl_bw); #endif @@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; struct rb_node *parent = NULL; struct task_struct *entry; - int leftmost = 1; + bool leftmost = true; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); @@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) link = &parent->rb_left; else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) { - dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + if (leftmost) dl_rq->earliest_dl.next = p->dl.deadline; - } rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + 
rb_insert_color_cached(&p->pushable_dl_tasks, + &dl_rq->pushable_dl_tasks_root, leftmost); } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { struct rb_node *next_node; next_node = rb_next(&p->pushable_dl_tasks); - dl_rq->pushable_dl_tasks_leftmost = next_node; if (next_node) { dl_rq->earliest_dl.next = rb_entry(next_node, struct task_struct, pushable_dl_tasks)->dl.deadline; } } - rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } static inline int has_pushable_dl_tasks(struct rq *rq) { - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } static int push_dl_task(struct rq *rq); @@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { - struct rb_node *leftmost = dl_rq->rb_leftmost; + struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); @@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node **link = &dl_rq->root.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_dl_entity *entry; int leftmost = 1; @@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) } } - if (leftmost) - dl_rq->rb_leftmost = &dl_se->rb_node; - 
rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); inc_dl_tasks(dl_se, dl_rq); } @@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) if (RB_EMPTY_NODE(&dl_se->rb_node)) return; - if (dl_rq->rb_leftmost == &dl_se->rb_node) { - struct rb_node *next_node; - - next_node = rb_next(&dl_se->rb_node); - dl_rq->rb_leftmost = next_node; - } - - rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + rb_erase_cached(&dl_se->rb_node, &dl_rq->root); RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); @@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { - struct rb_node *left = dl_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; @@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; if (!has_pushable_dl_tasks(rq)) @@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, struct task_struct, pushable_dl_tasks); BUG_ON(rq->cpu != task_cpu(p)); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..8e536d963652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -530,7 +530,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) + if 
(rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bc0a883d190..a5d83ed8dd82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); + if (leftmost) { /* non-empty tree */ + struct sched_entity *se; + se = rb_entry(leftmost, struct sched_entity, run_node); if (!curr) vruntime = se->vruntime; @@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - int leftmost = 1; + bool leftmost = true; /* * Find the right place in the rbtree: @@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color_cached(&se->run_node, + &cfs_rq->tasks_timeline, leftmost); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if 
(cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - struct rb_node *left = cfs_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; @@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; @@ -9312,7 +9299,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { - cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6ed7962dc896..746ac78ff492 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -426,8 +426,7 @@ struct cfs_rq { u64 min_vruntime_copy; #endif - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. @@ -550,8 +549,7 @@ struct rt_rq { /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root rb_root; - struct rb_node *rb_leftmost; + struct rb_root_cached root; unsigned long dl_nr_running; @@ -575,8 +573,7 @@ struct dl_rq { * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. 
*/ - struct rb_root pushable_dl_tasks_root; - struct rb_node *pushable_dl_tasks_leftmost; + struct rb_root_cached pushable_dl_tasks_root; #else struct dl_bw dl_bw; #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ea4fb315719..2cafb49aa65e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -#endif +#endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..2a685b45b73b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include <linux/time.h> #include <linux/uaccess.h> #include <linux/list.h> +#include <linux/blk-cgroup.h> #include "../../block/blk.h" @@ -46,10 +47,16 @@ static __cacheline_aligned_in_smp 
DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, +#endif { } }; @@ -68,7 +75,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +84,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +101,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? 
__BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +133,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,11 +150,12 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -167,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? 
cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +223,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +234,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +249,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +263,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +280,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. 
*/ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +302,12 @@ record_it: t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -359,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -684,6 +708,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +748,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. 
* **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +769,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +813,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +821,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, 
what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +844,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +853,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, @@ -804,13 +869,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +886,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 
0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +902,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +919,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +935,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -896,12 +963,12 @@ static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +1001,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1025,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } 
EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1099,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1134,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) +{ + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) +{ + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent) + 1; + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1176,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1193,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void 
(blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1213,43 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1280,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 
trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1288,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1300,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1316,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1336,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { 
char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1400,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1326,23 +1429,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? 
"message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 96cea88fa00f..6abfafd7f173 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2828,13 +2828,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are per_cpu ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or per_cpu ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + goto free_ops; + return 0; } @@ -2900,6 +2901,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (IS_ENABLED(CONFIG_PREEMPT)) synchronize_rcu_tasks(); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -5690,10 +5692,51 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } +static void +clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int i; + + if (ftrace_hash_empty(hash)) + return; + + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + entry = __ftrace_lookup_ip(hash, rec->ip); + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. 
+ */ + if (entry) + entry->ip = 0; + } +} + +/* Clear any records from hashs */ +static void clear_mod_from_hashes(struct ftrace_page *pg) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash); + clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page **last_pg; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; int order; @@ -5723,14 +5766,25 @@ void ftrace_release_mod(struct module *mod) ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - free_pages((unsigned long)pg->records, order); - kfree(pg); + + pg->next = tmp_page; + tmp_page = pg; } else last_pg = &pg->next; } out_unlock: mutex_unlock(&ftrace_lock); + + for (pg = tmp_page; pg; pg = tmp_page) { + + /* Needs to be called outside of ftrace_lock */ + clear_mod_from_hashes(pg); + + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + tmp_page = pg->next; + kfree(pg); + } } void ftrace_module_enable(struct module *mod) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44004d8aa3b3..5360b7aec57a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void) struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->clear_trace) + continue; + tr->clear_trace = false; tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); @@ -2799,11 +2802,17 @@ static char *get_trace_buf(void) if (!buffer || buffer->nesting >= 4) return 
NULL; - return &buffer->buffer[buffer->nesting++][0]; + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting][0]; } static void put_trace_buf(void) { + /* Don't let the decrement of nesting leak before this */ + barrier(); this_cpu_dec(trace_percpu_buffer->nesting); } @@ -6220,7 +6229,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 490ba229931d..fb5d54d0d1b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -245,6 +245,7 @@ struct trace_array { int stop_count; int clock_id; int nr_topts; + bool clear_trace; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 36132f9280e6..87468398b9ed 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -406,7 +406,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); - clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); @@ -466,7 +466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. 
*/ - call->flags |= TRACE_EVENT_FL_WAS_ENABLED; + set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } @@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call) do_for_each_event_file(tr, file) { if (file->event_call != call) continue; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is @@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; - bool clear_trace = false; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) { - if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) - clear_trace = true; + if (call->mod == mod) __trace_remove_event_call(call); - } } up_write(&trace_event_sem); @@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. 
*/ - if (clear_trace) - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index cb917cebae29..b17ec642793b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -273,7 +273,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; diff --git a/kernel/umh.c b/kernel/umh.c new file mode 100644 index 000000000000..6ff9905250ff --- /dev/null +++ b/kernel/umh.c @@ -0,0 +1,568 @@ +/* + * umh - the kernel usermode helper + */ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/rwsem.h> +#include <linux/ptrace.h> +#include <linux/async.h> +#include <linux/uaccess.h> + +#include 
<trace/events/module.h> + +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); + +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + +/* + * This is the task which runs the usermode application + */ +static int call_usermodehelper_exec_async(void *data) +{ + struct subprocess_info *sub_info = data; + struct cred *new; + int retval; + + spin_lock_irq(&current->sighand->siglock); + flush_signal_handlers(current, 1); + spin_unlock_irq(&current->sighand->siglock); + + /* + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. 
+ */ + set_user_nice(current, 0); + + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto out; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto out; + } + } + + commit_creds(new); + + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UMH_WAIT_PROC. + */ + if (!(sub_info->wait & UMH_WAIT_PROC)) + umh_complete(sub_info); + if (!retval) + return 0; + do_exit(0); +} + +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) +{ + pid_t pid; + + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + kernel_sigaction(SIGCHLD, SIG_DFL); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + } else { + int ret = -ECHILD; + /* + * Normally it is bogus to call wait4() from in-kernel because + * wait4() wants to write the exit code to a userspace address. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. + * + * Thus the __user pointer cast is valid here. + */ + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. 
+ */ + if (ret) + sub_info->retval = ret; + } + + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + + umh_complete(sub_info); +} + +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) +{ + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); + + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } + } +} + +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. 
+ */ +static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_disable() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled + * to become 'false'. + */ +static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_disable() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + +int usermodehelper_read_trylock(void) +{ + DEFINE_WAIT(wait); + int ret = 0; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + if (usermodehelper_disabled == UMH_DISABLED) + ret = -EAGAIN; + + up_read(&umhelper_sem); + + if (ret) + break; + + schedule(); + try_to_freeze(); + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return ret; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); + +long usermodehelper_read_lock_wait(long timeout) +{ + DEFINE_WAIT(wait); + + if (timeout < 0) + return -EINVAL; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + up_read(&umhelper_sem); + + timeout = schedule_timeout(timeout); + if (!timeout) + break; + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return timeout; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); + +void usermodehelper_read_unlock(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); + +/** + * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. 
+ * @depth: New value to assign to usermodehelper_disabled. + * + * Change the value of usermodehelper_disabled (under umhelper_sem locked for + * writing) and wakeup tasks waiting for it to change. + */ +void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) +{ + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); +} + +/** + * __usermodehelper_disable - Prevent new helpers from being started. + * @depth: New value to assign to usermodehelper_disabled. + * + * Set usermodehelper_disabled to @depth and wait for running helpers to exit. + */ +int __usermodehelper_disable(enum umh_disable_depth depth) +{ + long retval; + + if (!depth) + return -EINVAL; + + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + up_write(&umhelper_sem); + + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) + return 0; + + __usermodehelper_set_disable_depth(UMH_ENABLED); + return -EAGAIN; +} + +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +/** + * call_usermodehelper_setup - prepare to call a usermode helper + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data + * + * Returns either %NULL on allocation failure, or a subprocess_info + * structure. 
This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process. + * + * The cleanup function is called just before the subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else + sub_info->path = path; +#endif + sub_info->argv = argv; + sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocess + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). 
+ */ +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) +{ + DECLARE_COMPLETION_ONSTACK(done); + int retval = 0; + + if (!sub_info->path) { + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; + } + helper_lock(); + if (usermodehelper_disabled) { + retval = -EBUSY; + goto out; + } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; + sub_info->wait = wait; + + queue_work(system_unbound_wq, &sub_info->work); + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ + goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + + wait_for_completion(&done); +wait_done: + retval = sub_info->retval; +out: + call_usermodehelper_freeinfo(sub_info); +unlock: + helper_unlock(); + return retval; +} +EXPORT_SYMBOL(call_usermodehelper_exec); + +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). 
+ */ +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); + if (info == NULL) + return -ENOMEM; + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper); + +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write an array of ulongs from userspace. 
Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ab3c0dc8c7ed..64d0edf428f8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -21,7 +21,7 @@ * pools for workqueues which are not bound to any specific CPU - the * number of these backing pools is dynamic. * - * Please read Documentation/workqueue.txt for details. + * Please read Documentation/core-api/workqueue.rst for details. */ #include <linux/export.h> @@ -2269,7 +2269,7 @@ sleep: * event. */ worker_enter_idle(worker); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_IDLE); spin_unlock_irq(&pool->lock); schedule(); goto woke_up; @@ -2311,7 +2311,7 @@ static int rescuer_thread(void *__rescuer) */ rescuer->task->flags |= PF_WQ_WORKER; repeat: - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_IDLE); /* * By the time the rescuer is requested to stop, the workqueue |