diff options
Diffstat (limited to 'kernel')
166 files changed, 13905 insertions, 6492 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 88c92fb44618..5068e2a4e75f 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE config MUTEX_SPIN_ON_OWNER - def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES + def_bool SMP && !DEBUG_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba33..2d64cfcc8b42 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_perf_event.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif @@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o -obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o @@ -103,10 +101,12 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o -obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o + +obj-$(CONFIG_PERF_EVENTS) += events/ + obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index e4956244ae50..939500317066 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -74,6 +74,8 @@ static int audit_initialized; int audit_enabled; int audit_ever_enabled; +EXPORT_SYMBOL_GPL(audit_enabled); + /* Default state when kernel boots without any parameters. */ static int audit_default; @@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; - loginuid = NETLINK_CB(skb).loginuid; - sessionid = NETLINK_CB(skb).sessionid; - sid = NETLINK_CB(skb).sid; + loginuid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 37b2bea170c8..e99dda04b126 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -607,7 +607,7 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); - /* this could be NULL if the watch is dieing else where... */ + /* this could be NULL if the watch is dying else where... */ struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c7866460..e683869365d9 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. */ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; + err = kern_path_parent(watch->path, &nd); + if (err) + return err; - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. */ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index add2819af71b..f8277c80d678 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; + u32 sid; switch (f->type) { case AUDIT_PID: @@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, result = audit_comparator(cb->creds.gid, f->op, f->val); break; case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, f->op, f->val); + result = audit_comparator(audit_get_loginuid(current), + f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: - if (f->lsm_rule) - result = security_audit_rule_match(cb->sid, + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule, NULL); + } break; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f49a0318c2ed..00d79df03e76 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 - * otherwise. */ + * otherwise. + * + * If task_creation is true, this is an explicit indication that we are + * filtering a task rule at task creation time. This and tsk == current are + * the only situations where tsk->cred may be accessed without an rcu read lock. + */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, struct audit_names *name, - enum audit_state *state) + enum audit_state *state, + bool task_creation) { - const struct cred *cred = get_task_cred(tsk); + const struct cred *cred; int i, j, need_sid = 1; u32 sid; + cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); + for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; @@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk, break; } - if (!result) { - put_cred(cred); + if (!result) return 0; - } } if (ctx) { @@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } - put_cred(cred); return 1; } @@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, + &state, true)) { if (state == AUDIT_RECORD_CONTEXT) *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); @@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, list_for_each_entry_rcu(e, list, list) { if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state)) { + &state, false)) { rcu_read_unlock(); ctx->current_state = state; return state; @@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) list_for_each_entry_rcu(e, list, list) { if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { + audit_filter_rules(tsk, &e->rule, ctx, n, + &state, false)) { rcu_read_unlock(); ctx->current_state = state; return; @@ -1011,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, /* * to_send and len_sent accounting are very loose estimates. We aren't * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundry) + * within about 500 bytes (next page boundary) * * why snprintf? an int is up to 12 digits long. if we just assumed when * logging that a[%d]= was going to be 16 characters long we would be wasting diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f26c136..0c9b862292b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,11 +9,13 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> +#include <linux/page_cgroup.h> void foo(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); /* End of constants */ } diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f132c8..283c529f8b1c 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> /* @@ -21,12 +22,8 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set); int file_caps_enabled = 1; @@ -290,6 +287,60 @@ error: } /** + * has_capability - Does a task have a capability in init_user_ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the initial user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -299,17 +350,60 @@ error: * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ -int capable(int cap) +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); BUG(); } - if (security_capable(current_cred(), cap) == 0) { + if (security_capable(ns, current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; - return 1; + return true; } - return 0; + return false; +} +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); + +/** + * nsown_capable - Check superior capability to one's own user_ns + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at its own user namespace. + */ +bool nsown_capable(int cap) +{ + return ns_capable(current_user_ns(), cap); } -EXPORT_SYMBOL(capable); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..2731d115d725 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> +#include <linux/flex_array.h> /* used in cgroup_attach_proc */ #include <asm/atomic.h> @@ -157,7 +158,7 @@ struct css_id { }; /* - * cgroup_event represents events which userspace want to recieve. + * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { /* @@ -326,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -static void free_css_set_rcu(struct rcu_head *obj) -{ - struct css_set *cg = container_of(obj, struct css_set, rcu_head); - kfree(cg); -} - /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups @@ -375,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } write_unlock(&css_set_lock); - call_rcu(&cg->rcu_head, free_css_set_rcu); + kfree_rcu(cg, rcu_head); } /* @@ -812,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) return ret; } -static void free_cgroup_rcu(struct rcu_head *obj) -{ - struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); - - kfree(cgrp); -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -856,7 +844,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ BUG_ON(!list_empty(&cgrp->pidlists)); - call_rcu(&cgrp->rcu_head, free_cgroup_rcu); + kfree_rcu(cgrp, rcu_head); } iput(inode); } @@ -1748,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. */ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. + */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1758,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval = 0; + int retval; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; - struct css_set *cg; - struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; /* Nothing to do if the task is already in that cgroup */ @@ -1772,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1784,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } + if (ss->can_attach_task) { + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + goto out; + } + } } - task_lock(tsk); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - put_css_set(cg); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - retval = -ESRCH; + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); + if (retval) goto out; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } - write_unlock(&css_set_lock); for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + if (ss->attach_task) + ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); - put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1844,7 +1881,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk, false); + ss->cancel_attach(ss, cgrp, tsk); } } return retval; @@ -1875,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. + * TODO: possible optimization: use css_set->rcu_head for chaining instead + */ +struct cg_list_entry { + struct css_set *cg; + struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, + struct task_struct *tsk, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + + read_lock(&css_set_lock); + newcg = find_existing_css_set(cg, cgrp, template); + if (newcg) + get_css_set(newcg); + read_unlock(&css_set_lock); + + /* doesn't exist at all? */ + if (!newcg) + return false; + /* see if it's already in the list */ + list_for_each_entry(cg_entry, newcg_list, links) { + if (cg_entry->cg == newcg) { + put_css_set(newcg); + return true; + } + } + + /* not found */ + put_css_set(newcg); + return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. + */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct flex_array *group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. + */ + group_size = get_nr_threads(leader); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); + if (!group) + return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. + */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + /* ignore this task if it's going away */ + task_unlock(tsk); + continue; + } + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. */ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. + */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } +out_free_group_list: + flex_array_free(group); + return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { + if (!tsk) { + rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. + */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ rcu_read_unlock(); + cgroup_unlock(); return -ESRCH; } + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); + cgroup_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { - tsk = current; + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; get_task_struct(tsk); } - ret = cgroup_attach_task(cgrp, tsk); + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } put_task_struct(tsk); + cgroup_unlock(); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +{ int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the operation, in which case we need + * to find the task_struct for the new leader and start over. + */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); return ret; } @@ -3274,9 +3632,9 @@ static struct cftype files[] = { { .name = CGROUP_FILE_GENERIC_PREFIX "procs", .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ + .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, - .mode = S_IRUGO, + .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", @@ -3655,12 +4013,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4230,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. @@ -4253,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } @@ -4261,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; - task_unlock(tsk); - if (cg) - put_css_set_taskexit(cg); -} - -/** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. - */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - mutex_unlock(&cgroup_mutex); - return 0; - } - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } } - - /* Keep the cgroup alive */ - task_lock(tsk); - parent = task_cgroup(tsk, subsys->subsys_id); - cg = tsk->cgroups; - get_css_set(cg); task_unlock(tsk); - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - /* The cgroup now exists. Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(root->sb); - return ret; + if (cg) + put_css_set_taskexit(cg); } /** @@ -4620,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, return ret; } -static void __free_css_id_cb(struct rcu_head *head) -{ - struct css_id *id; - - id = container_of(head, struct css_id, rcu_head); - kfree(id); -} - void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) { struct css_id *id = css->id; @@ -4642,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); spin_unlock(&ss->id_lock); - call_rcu(&id->rcu_head, __free_css_id_cb); + kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4813,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e7bebb7c6c38..e691818d7e45 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss, */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) + struct task_struct *task) { struct freezer *freezer; @@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state != CGROUP_THAWED) return -EBUSY; + return 0; +} + +static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ rcu_read_lock(); - if (__cgroup_freezing_or_frozen(task)) { + if (__cgroup_freezing_or_frozen(tsk)) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); - - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (__cgroup_freezing_or_frozen(c)) { - rcu_read_unlock(); - return -EBUSY; - } - } - rcu_read_unlock(); - } - return 0; } @@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, + .can_attach_task = freezer_can_attach_task, + .pre_attach = NULL, + .attach_task = NULL, .attach = NULL, .fork = freezer_fork, .exit = NULL, diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a8..fc9eb093acd5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; } +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, &utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -235,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) return compat_jiffies_to_clock_t(jiffies); } +#ifdef __ARCH_WANT_SYS_SIGPENDING + /* * Assumption: old_sigset_t and compat_old_sigset_t are both * types that can be passed to put_user()/get_user(). @@ -254,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) return ret; } +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK + asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, compat_old_sigset_t __user *oset) { @@ -275,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, return ret; } +#endif + asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { @@ -617,6 +683,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } +long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *utp) +{ + struct timex txc; + mm_segment_t oldfs; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); + set_fs(oldfs); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} + long compat_sys_clock_getres(clockid_t which_clock, struct compat_timespec __user *tp) { @@ -809,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, { compat_sigset_t s32; sigset_t s; - int sig; struct timespec t; siginfo_t info; - long ret, timeout = 0; + long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; @@ -820,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) return -EFAULT; sigset_from_compat(&s, &s32); - sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&s); if (uts) { - if (get_compat_timespec (&t, uts)) + if (get_compat_timespec(&t, uts)) return -EFAULT; - if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 - || t.tv_sec < 0) - return -EINVAL; } - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = timespec_to_jiffies(&t) - +(t.tv_sec || t.tv_nsec); - if (timeout) { - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &s); - - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - } - spin_unlock_irq(¤t->sighand->siglock); + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - }else { - ret = timeout?-EINTR:-EAGAIN; + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; } + return ret; } @@ -951,58 +1007,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; - int ret; - - memset(&txc, 0, sizeof(struct timex)); + int err, ret; - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, &utp->stbcnt)) - return -EFAULT; + err = compat_get_timex(&txc, utp); + if (err) + return err; ret = do_adjtimex(&txc); - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; + err = compat_put_timex(utp, &txc); + if (err) + return err; return ret; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc5556140..12b7458f23b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} -#endif /* #esle #if CONFIG_HOTPLUG_CPU */ +#endif /* #else #if CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ int __ref register_cpu_notifier(struct notifier_block *nb) @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e92e98189032..9c9b7545c810 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); - - if (!newmems) - return; + static nodemask_t newmems; /* protected by cgroup_mutex */ cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, newmems); + guarantee_online_mems(cs, &newmems); - cpuset_change_task_nodemask(p, newmems); - - NODEMASK_FREE(newmems); + cpuset_change_task_nodemask(p, &newmems); mm = get_task_mm(p); if (!mm) @@ -1164,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= SD_LV_MAX) + if (val < -1 || val >= sched_domain_level_max) return -EINVAL; #endif @@ -1372,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Protected by cgroup_lock */ -static cpumask_var_t cpus_attach; - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk, bool threadgroup) + struct task_struct *tsk) { - int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1396,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk); - if (ret) - return ret; - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c); - if (ret) { - rcu_read_unlock(); - return ret; - } - } - rcu_read_unlock(); - } return 0; } -static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, - struct cpuset *cs) +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + return security_task_setscheduler(task); +} + +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in pre_attach, and they must + * persist among pre_attach, attach_task, and attach. + */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + +/* Set-up work for before attaching each task. */ +static void cpuset_pre_attach(struct cgroup *cont) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +} + +/* Per-thread attachment work. */ +static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) { int err; + struct cpuset *cs = cgroup_cs(cont); + /* * can_attach beforehand should guarantee that this doesn't fail. * TODO: have a better way to handle failure here @@ -1426,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - cpuset_change_task_nodemask(tsk, to); + cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, tsk); - } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk, - bool threadgroup) + struct cgroup *oldcont, struct task_struct *tsk) { struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - if (from == NULL || to == NULL) - goto alloc_fail; - - if (cs == &top_cpuset) { - cpumask_copy(cpus_attach, cpu_possible_mask); - } else { - guarantee_online_cpus(cs, cpus_attach); - } - guarantee_online_mems(cs, to); - - /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, to, cs); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, to, cs); - } - rcu_read_unlock(); - } - - /* change mm; only needs to be done once even if threadgroup */ - *from = oldcs->mems_allowed; - *to = cs->mems_allowed; + /* + * Change mm, possibly for multiple threads in a threadgroup. This is + * expensive and may sleep. + */ + cpuset_attach_nodemask_from = oldcs->mems_allowed; + cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, to); + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, from, to); + cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + &cpuset_attach_nodemask_to); mmput(mm); } - -alloc_fail: - NODEMASK_FREE(from); - NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1610,34 +1589,26 @@ out: * across a page fault. */ -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - int ret; + size_t count; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return ret; + return count; } -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); - int retval; - - if (mask == NULL) - return -ENOMEM; + size_t count; mutex_lock(&callback_mutex); - *mask = cs->mems_allowed; + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); mutex_unlock(&callback_mutex); - retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); - - NODEMASK_FREE(mask); - - return retval; + return count; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1831,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) } /* - * post_clone() is called at the end of cgroup_clone(). - * 'cgroup' was just created automatically as a result of - * a cgroup_clone(), and the current task is about to - * be moved into 'cgroup'. + * post_clone() is called during cgroup_create() when the + * clone_children mount argument was specified. The cgroup + * can not yet have any tasks. * * Currently we refuse to set up the cgroup - thereby * refusing the task to be entered, and as a result refusing @@ -1862,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, cs = cgroup_cs(cgroup); parent_cs = cgroup_cs(parent); + mutex_lock(&callback_mutex); cs->mems_allowed = parent_cs->mems_allowed; cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); + mutex_unlock(&callback_mutex); return; } @@ -1931,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, + .can_attach_task = cpuset_can_attach_task, + .pre_attach = cpuset_pre_attach, + .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, @@ -2066,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return; + static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2086,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - *oldmems = cp->mems_allowed; + oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2102,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, oldmems, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); } } - NODEMASK_FREE(oldmems); } /* @@ -2147,19 +2118,16 @@ void cpuset_update_active_cpus(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return NOTIFY_DONE; + static nodemask_t oldmems; /* protected by cgroup_mutex */ cgroup_lock(); switch (action) { case MEM_ONLINE: - *oldmems = top_cpuset.mems_allowed; + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, oldmems, NULL); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2173,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, } cgroup_unlock(); - NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif @@ -2223,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs = task_cs(tsk); if (cs) - cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); + do_set_cpus_allowed(tsk, cs->cpus_allowed); rcu_read_unlock(); /* @@ -2250,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) * Like above we can temporary set any mask and rely on * set_cpus_allowed_ptr() as synchronization point. */ - cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); + do_set_cpus_allowed(tsk, cpu_possible_mask); cpu = cpumask_any(cpu_active_mask); } diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..5f85690285d4 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,34 @@ +#include <linux/kernel.h> +#include <linux/crash_dump.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/module.h> + +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); diff --git a/kernel/cred.c b/kernel/cred.c index 3a9d6dd53a6c..174fa84eca30 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/credentials.txt +/* Task credentials management - see Documentation/security/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar; static struct thread_group_cred init_tgcred = { .usage = ATOMIC_INIT(2), .tgid = 0, - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), }; #endif @@ -49,11 +49,12 @@ struct cred init_cred = { .magic = CRED_MAGIC, #endif .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_INIT_INH_SET, + .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_INIT_EFF_SET, - .cap_bset = CAP_INIT_BSET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, .user = INIT_USER, + .user_ns = &init_user_ns, .group_info = &init_groups, #ifdef CONFIG_KEYS .tgcred = &init_tgcred, @@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) goto error_put; } + /* cache user_ns in cred. Doesn't need a refcount because it will + * stay pinned by cred->user + */ + new->user_ns = new->user->user_ns; + #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index cefd4a11f6d9..bad6786dee88 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -538,7 +538,7 @@ return_normal: /* * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the + * that was single stepping. To guard against a deadlock, the * kernel will only try for the value of sstep_tries before * giving up and continuing on. */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd2dfe7..a11db956dd62 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) put_packet(remcom_out_buffer); return 0; } + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. + */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e29caa3..be14779bcef6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; static kdbtab_t *kdb_commands; #define KDB_BASE_CMD_MAX 50 static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[50]; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ @@ -441,9 +441,9 @@ static int kdb_check_regs(void) * symbol name, and offset to the caller. * * The argument may consist of a numeric value (decimal or - * hexidecimal), a symbol name, a register name (preceeded by the + * hexidecimal), a symbol name, a register name (preceded by the * percent sign), an environment variable with a numeric value - * (preceeded by a dollar sign) or a simple arithmetic expression + * (preceded by a dollar sign) or a simple arithmetic expression * consisting of a symbol name, +/-, and a numeric constant value * (offset). * Parameters: @@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) * error The hardware-defined error code * reason2 kdb's current reason code. * Initially error but can change - * acording to kdb state. + * according to kdb state. * db_result Result code from break or debug point. * regs The exception frame at time of fault/breakpoint. * should always be valid. @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) "Send a signal to a process", 0, KDB_REPEAT_NONE); kdb_register_repeat("summary", kdb_summary, "", "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "", + kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]", "Display per_cpu variables", 3, KDB_REPEAT_NONE); kdb_register_repeat("grephelp", kdb_grep_help, "", "Display help on | grep", 0, KDB_REPEAT_NONE); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485dcb050..5532dd37aa86 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) * Mask for process state. * Notes: * The mask folds data from several sources into a single long value, so - * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * be careful not to overlap the bits. TASK_* bits are in the LSB, * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there * is no overlap between TASK_* and EXIT_* but that may not always be * true, so EXIT_* bits are shifted left 16 bits before being stored in diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 000000000000..1ce23d3d8394 --- /dev/null +++ b/kernel/events/Makefile @@ -0,0 +1,6 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_core.o = -pg +endif + +obj-y := core.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/perf_event.c b/kernel/events/core.c index 656222fcf767..d863b3c057bb 100644 --- a/kernel/perf_event.c +++ b/kernel/events/core.c @@ -2,8 +2,8 @@ * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -38,13 +38,96 @@ #include <asm/irq_regs.h> +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +struct jump_label_key perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -89,6 +194,361 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp; + + /* + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) + */ + if (!is_cgroup_event(event)) + return; + + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* must be done before we fput() the file */ + perf_get_cgroup(event); + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } +out: + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -126,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } -static void free_ctx(struct rcu_head *head) -{ - struct perf_event_context *ctx; - - ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx); -} - static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -141,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); + kfree_rcu(ctx, rcu_head); } } @@ -254,7 +706,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); } /* @@ -271,6 +722,10 @@ static void update_context_time(struct perf_event_context *ctx) static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + return ctx ? ctx->time : 0; } @@ -285,9 +740,20 @@ static void update_event_times(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - - if (ctx->is_active) + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. + */ + if (is_cgroup_event(event)) run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; else run_end = event->tstamp_stopped; @@ -299,6 +765,7 @@ static void update_event_times(struct perf_event *event) run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; + } /* @@ -347,6 +814,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -465,6 +935,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -473,6 +944,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -544,7 +1027,8 @@ out: static inline int event_filter_match(struct perf_event *event) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); } static void @@ -562,7 +1046,7 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; } @@ -606,47 +1090,30 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_event_remove_from_context(void *info) +static int __perf_remove_from_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); + + return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * Must be called with ctx->mutex held. - * * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * @@ -657,49 +1124,48 @@ static void __perf_event_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_event_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + if (!task) { /* * Per cpu events are removed via an smp call and * the removal is always successful. */ - smp_call_function_single(event->cpu, - __perf_event_remove_from_context, - event, 1); + cpu_function_call(event->cpu, __perf_remove_from_context, event); return; } retry: - task_oncpu_function_call(task, __perf_event_remove_from_context, - event); + if (!task_function_call(task, __perf_remove_from_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * If the context is active we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->nr_active && !list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can remove the event safely, if the call above did not - * succeed. + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. */ - if (!list_empty(&event->group_entry)) - list_del_event(event, ctx); + list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to disable a performance event */ -static void __perf_event_disable(void *info) +static int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -708,9 +1174,12 @@ static void __perf_event_disable(void *info) /* * If this is a per-task event, need to check whether this * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. */ if (ctx->task && cpuctx->task_ctx != ctx) - return; + return -EINVAL; raw_spin_lock(&ctx->lock); @@ -720,6 +1189,7 @@ static void __perf_event_disable(void *info) */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); + update_cgrp_time_from_event(event); update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); @@ -729,6 +1199,8 @@ static void __perf_event_disable(void *info) } raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -753,13 +1225,13 @@ void perf_event_disable(struct perf_event *event) /* * Disable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_disable, - event, 1); + cpu_function_call(event->cpu, __perf_event_disable, event); return; } retry: - task_oncpu_function_call(task, __perf_event_disable, event); + if (!task_function_call(task, __perf_event_disable, event)) + return; raw_spin_lock_irq(&ctx->lock); /* @@ -767,6 +1239,11 @@ retry: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -778,10 +1255,44 @@ retry: update_group_times(event); event->state = PERF_EVENT_STATE_OFF; } - raw_spin_unlock_irq(&ctx->lock); } +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. + */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -822,7 +1333,7 @@ event_sched_in(struct perf_event *event, event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = tstamp - ctx->timestamp; + perf_set_shadow_time(event, ctx, tstamp); if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -943,12 +1454,15 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); + /* * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ -static void __perf_install_in_context(void *info) +static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -957,21 +1471,22 @@ static void __perf_install_in_context(void *info) int err; /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no events. + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx, ctx->task); raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); @@ -1012,6 +1527,8 @@ static void __perf_install_in_context(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1023,8 +1540,6 @@ unlock: * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_event_context *ctx, @@ -1033,6 +1548,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + event->ctx = ctx; if (!task) { @@ -1040,31 +1557,29 @@ perf_install_in_context(struct perf_event_context *ctx, * Per cpu events are installed via an smp call and * the install is always successful. */ - smp_call_function_single(cpu, __perf_install_in_context, - event, 1); + cpu_function_call(cpu, __perf_install_in_context, event); return; } retry: - task_oncpu_function_call(task, __perf_install_in_context, - event); + if (!task_function_call(task, __perf_install_in_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->is_active && list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can add the event safely, if it the call above did not - * succeed. + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. */ - if (list_empty(&event->group_entry)) - add_event_to_ctx(event, ctx); + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1093,7 +1608,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, /* * Cross CPU call to enable a performance event */ -static void __perf_event_enable(void *info) +static int __perf_event_enable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -1101,26 +1616,27 @@ static void __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, ctx); + __perf_event_mark_enabled(event, ctx); - if (!event_filter_match(event)) + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); goto unlock; + } /* * If the event is in a group and isn't the group leader, @@ -1153,6 +1669,8 @@ static void __perf_event_enable(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1173,8 +1691,7 @@ void perf_event_enable(struct perf_event *event) /* * Enable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_enable, - event, 1); + cpu_function_call(event->cpu, __perf_event_enable, event); return; } @@ -1193,8 +1710,15 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + raw_spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_event_enable, event); + + if (!task_function_call(task, __perf_event_enable, event)) + return; raw_spin_lock_irq(&ctx->lock); @@ -1202,15 +1726,14 @@ retry: * If the context is active and the event is still off, * we need to retry the cross-call. */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_OFF) - __perf_event_mark_enabled(event, ctx); + } out: raw_spin_unlock_irq(&ctx->lock); @@ -1242,6 +1765,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) goto out; @@ -1354,8 +1878,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; @@ -1431,6 +1955,14 @@ void __perf_event_task_sched_out(struct task_struct *task, for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); } static void task_ctx_sched_out(struct perf_event_context *ctx, @@ -1469,6 +2001,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1501,6 +2037,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1511,15 +2051,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { + u64 now; + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; - ctx->timestamp = perf_clock(); - + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1536,11 +2080,12 @@ out: } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task); } static void task_ctx_sched_in(struct perf_event_context *ctx, @@ -1548,15 +2093,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, { struct perf_cpu_context *cpuctx; - cpuctx = __get_cpu_context(ctx); + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, NULL); cpuctx->task_ctx = ctx; } -void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) { struct perf_cpu_context *cpuctx; @@ -1572,9 +2118,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); cpuctx->task_ctx = ctx; @@ -1607,8 +2153,15 @@ void __perf_event_task_sched_in(struct task_struct *task) if (likely(!ctx)) continue; - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, task); } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -1638,7 +2191,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ -#define REDUCE_FLS(a, b) \ +#define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ @@ -1808,7 +2361,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); @@ -1863,6 +2416,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (!ctx || !ctx->nr_events) goto out; + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -1887,7 +2448,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); - perf_event_context_sched_in(ctx); + /* + * Also calls ctxswin for cgroup events, if any: + */ + perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); } @@ -1912,8 +2476,10 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); @@ -1944,8 +2510,10 @@ static u64 perf_event_read(struct perf_event *event) * (e.g., thread is blocked), in that case * we cannot update context time */ - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2224,6 +2792,9 @@ errout: } +/* + * Returns a matching context with refcount and pincount. + */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { @@ -2248,6 +2819,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + ++ctx->pin_count; return ctx; } @@ -2261,6 +2833,7 @@ retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); + ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2282,8 +2855,10 @@ retry: err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; - else + else { + ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { @@ -2323,7 +2898,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_task_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2332,6 +2907,10 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } } if (event->buffer) { @@ -2339,6 +2918,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + if (event->destroy) event->destroy(event); @@ -4406,26 +4988,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (unlikely(!is_sampling_event(event))) return 0; - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_event_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling events even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the event got enabled again: - */ + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); ret = 1; } - } + } else + hwc->interrupts++; if (event->attr.freq) { u64 now = perf_clock(); @@ -4458,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, else perf_event_output(event, nmi, data, regs); + if (event->fasync && event->pending_kill) { + if (nmi) { + event->pending_wakeup = 1; + irq_work_queue(&event->pending); + } else + perf_event_wakeup(event); + } + return ret; } @@ -4567,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) - return 0; + return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -4753,14 +5331,6 @@ swevent_hlist_deref(struct swevent_htable *swhash) lockdep_is_held(&swhash->hlist_mutex)); } -static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) -{ - struct swevent_hlist *hlist; - - hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); - kfree(hlist); -} - static void swevent_hlist_release(struct swevent_htable *swhash) { struct swevent_hlist *hlist = swevent_hlist_deref(swhash); @@ -4769,7 +5339,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) return; rcu_assign_pointer(swhash->swevent_hlist, NULL); - call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); + kfree_rcu(hlist, rcu_head); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) @@ -4851,7 +5421,7 @@ fail: return err; } -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4923,6 +5493,8 @@ static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; /* * All tracepoints are from kernel-space. */ @@ -5062,6 +5634,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + event->pmu->read(event); perf_sample_data_init(&data, 0); @@ -5088,9 +5664,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - period = local64_read(&hwc->period_left); if (period) { if (period < 0) @@ -5117,6 +5690,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) } } +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + /* * Software event: cpu wall time clock */ @@ -5169,6 +5766,8 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5224,16 +5823,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } @@ -5246,6 +5838,8 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5517,17 +6111,22 @@ struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; + int ret; idx = srcu_read_lock(&pmus_srcu); rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); - if (pmu) + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); goto unlock; + } list_for_each_entry_rcu(pmu, &pmus, entry) { - int ret = pmu->event_init(event); + ret = pmu->event_init(event); if (!ret) goto unlock; @@ -5653,7 +6252,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_task_events); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5828,7 +6427,7 @@ SYSCALL_DEFINE5(perf_event_open, int err; /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); @@ -5845,6 +6444,15 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. + */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -5862,7 +6470,7 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - if (pid != -1) { + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); @@ -5876,6 +6484,19 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -5914,6 +6535,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (task) { + put_task_struct(task); + task = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ @@ -5961,10 +6587,10 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_event_remove_from_context(group_leader); + perf_remove_from_context(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_event_remove_from_context(sibling); + perf_remove_from_context(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); @@ -5987,6 +6613,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); event->owner = current; @@ -6012,6 +6639,7 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: + perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); @@ -6062,6 +6690,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; @@ -6113,17 +6742,20 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - struct perf_event *parent_event; + if (child_event->parent) { + raw_spin_lock_irq(&child_ctx->lock); + perf_group_detach(child_event); + raw_spin_unlock_irq(&child_ctx->lock); + } - perf_event_remove_from_context(child_event); + perf_remove_from_context(child_event); - parent_event = child_event->parent; /* - * It can happen that parent exits first, and has events + * It can happen that the parent exits first, and has events * that are still around due to the child reference. These - * events need to be zapped - but otherwise linger. + * events need to be zapped. */ - if (parent_event) { + if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); } @@ -6422,7 +7054,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -6537,6 +7169,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); return ret; } @@ -6606,9 +7239,9 @@ static void __perf_event_exit_context(void *__info) perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); } static void perf_event_exit_cpu_context(int cpu) @@ -6732,3 +7365,83 @@ unlock: return ret; } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + + jc = kzalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 086adf25a55e..086adf25a55e 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b1..20a406471525 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* Let father know we died * * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. + * that to send signals to arbitrary processes. * That stops right now. * * If the parent exec id doesn't match the exec id we saved @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); + WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); @@ -1015,7 +1016,7 @@ NORET_TYPE void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - flush_ptrace_hw_breakpoint(tsk); + ptrace_put_breakpoints(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -1376,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) return NULL; } -/* - * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. +/** + * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED + * @wo: wait options + * @ptrace: is the wait for ptrace + * @p: task to wait for + * + * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. + * + * CONTEXT: + * read_lock(&tasklist_lock), which is released if return value is + * non-zero. Also, grabs and releases @p->sighand->siglock. + * + * RETURNS: + * 0 if wait condition didn't exist and search for other wait conditions + * should continue. Non-zero return, -errno on failure and @p's pid on + * success, implies that tasklist_lock is released and wait condition + * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1396,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo, if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; + if (!task_stopped_code(p, ptrace)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); @@ -1537,33 +1553,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + /* dead body doesn't have much to contribute */ + if (p->exit_state == EXIT_DEAD) + return 0; + + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the real + * parent when the ptracer detaches. + */ + if (likely(!ptrace) && unlikely(task_ptrace(p))) { + /* it will become visible, clear notask_error */ + wo->notask_error = 0; + return 0; + } + + /* we don't reap group leaders with subthreads */ + if (!delay_group_leader(p)) + return wait_task_zombie(wo, p); + /* - * This child is hidden by ptrace. - * We aren't allowed to see it now, but eventually we will. + * Allow access to stopped/continued state via zombie by + * falling through. Clearing of notask_error is complex. + * + * When !@ptrace: + * + * If WEXITED is set, notask_error should naturally be + * cleared. If not, subset of WSTOPPED|WCONTINUED is set, + * so, if there are live subthreads, there are events to + * wait for. If all subthreads are dead, it's still safe + * to clear - this function will be called again in finite + * amount time once all the subthreads are released and + * will then return without clearing. + * + * When @ptrace: + * + * Stopped state is per-task and thus can't change once the + * target task dies. Only continued and exited can happen. + * Clear notask_error if WCONTINUED | WEXITED. + */ + if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) + wo->notask_error = 0; + } else { + /* + * If @p is ptraced by a task in its real parent's group, + * hide group stop/continued state when looking at @p as + * the real parent; otherwise, a single stop can be + * reported twice as group and ptrace stops. + * + * If a ptracer wants to distinguish the two events for its + * own children, it should create a separate process which + * takes the role of real parent. + */ + if (likely(!ptrace) && task_ptrace(p) && + same_thread_group(p->parent, p->real_parent)) + return 0; + + /* + * @p is alive and it's gonna stop, continue or exit, so + * there always is something to wait for. */ wo->notask_error = 0; - return 0; } - if (p->exit_state == EXIT_DEAD) - return 0; - /* - * We don't reap group leaders with subthreads. + * Wait for stopped. Depending on @ptrace, different stopped state + * is used and the two don't interact with each other. */ - if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) - return wait_task_zombie(wo, p); + ret = wait_task_stopped(wo, ptrace, p); + if (ret) + return ret; /* - * It's stopped or running now, so it might - * later continue, exit, or stop again. + * Wait for continued. There's only one continued state and the + * ptracer can consume it which can confuse the real parent. Don't + * use WCONTINUED from ptracer. You don't need or want it. */ - wo->notask_error = 0; - - if (task_stopped_code(p, ptrace)) - return wait_task_stopped(wo, ptrace, p); - return wait_task_continued(wo, p); } diff --git a/kernel/extable.c b/kernel/extable.c index 7f8f263f8524..5339705b8241 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr) return 0; } +/** + * core_kernel_data - tell if addr points to kernel data + * @addr: address to test + * + * Returns true if @addr passed in is from the core kernel data + * section. + * + * Note: On some archs it may return true for core RODATA, and false + * for others. But will always be true for core RW data. + */ +int core_kernel_data(unsigned long addr) +{ + if (addr >= (unsigned long)_sdata && + addr < (unsigned long)_edata) + return 1; + return 0; +} + int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..ca406d916713 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> +#include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> @@ -58,7 +59,6 @@ #include <linux/taskstats_kern.h> #include <linux/random.h> #include <linux/tty.h> -#include <linux/proc_fs.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> @@ -109,20 +109,25 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -193,6 +198,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround @@ -248,16 +254,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; @@ -376,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; - tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* @@ -479,6 +484,20 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) + return -ENOMEM; + + if (oldmm) + cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); + else + memset(mm_cpumask(mm), 0, cpumask_size()); +#endif + return 0; +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -515,10 +534,20 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); + if (!mm) + return NULL; + + memset(mm, 0, sizeof(*mm)); + mm = mm_init(mm, current); + if (!mm) + return NULL; + + if (mm_init_cpumask(mm, NULL)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } + return mm; } @@ -530,6 +559,7 @@ struct mm_struct * mm_alloc(void) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + free_cpumask_var(mm->cpu_vm_mask_var); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); @@ -566,6 +596,57 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call. + * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + /** * get_task_mm - acquire a reference to the task's mm * @@ -684,6 +765,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; + if (mm_init_cpumask(mm, oldmm)) + goto fail_nocpumask; + if (init_new_context(tsk, mm)) goto fail_nocontext; @@ -710,6 +794,9 @@ fail_nomem: return NULL; fail_nocontext: + free_cpumask_var(mm->cpu_vm_mask_var); + +fail_nocpumask: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() @@ -920,6 +1007,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->threadgroup_fork_lock); +#endif + sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1096,12 +1187,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1146,7 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); + sched_fork(p); retval = perf_event_init_task(p); if (retval) @@ -1180,12 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); @@ -1193,17 +1279,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT @@ -1289,7 +1372,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) + if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; @@ -1309,6 +1392,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; @@ -1347,6 +1432,8 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); @@ -1460,7 +1547,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - wake_up_new_task(p, clone_flags); + wake_up_new_task(p); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); @@ -1512,38 +1599,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1570,34 +1643,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - -/* * Unshare file descriptor table if it is being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) @@ -1625,45 +1670,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1689,19 +1726,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1718,20 +1742,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 66ecd2ead215..7b01de98bb6a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -17,7 +17,7 @@ static inline void frozen_process(void) { if (!unlikely(current->flags & PF_NOFREEZE)) { current->flags |= PF_FROZEN; - wmb(); + smp_wmb(); } clear_freeze_flag(current); } @@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) * the task as frozen and next clears its TIF_FREEZE. */ if (!freezing(p)) { - rmb(); + smp_rmb(); if (frozen(p)) return false; diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..fe28dc282eae 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,19 +685,17 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,14 +722,12 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -775,6 +772,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. A @@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); @@ -1504,8 +1505,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1525,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1578,9 +1577,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -1608,8 +1605,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex. + * the owner of the rt_mutex. */ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " @@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); @@ -1886,7 +1886,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = flags; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; @@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2057,7 +2057,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2072,17 +2072,14 @@ retry: * again. If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* @@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->robust_list; rcu_read_unlock(); } @@ -2463,11 +2469,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; @@ -2678,8 +2693,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac75e5b..5f9e689dc8f0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->compat_robust_list; rcu_read_unlock(); } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 70a298d6da71..b8cadf70b1fb 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f761001d517..e97ca59e2520 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,3 @@ -EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f35cf4..1cc476d52dd3 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..a9205e32a059 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -53,11 +53,10 @@ /* * The timer bases: * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { @@ -65,39 +64,55 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clock_base = { { - .index = CLOCK_REALTIME, + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, .resolution = KTIME_LOW_RES, }, { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, } }; +static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, +}; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, tomono; - struct timespec xts, tom; - unsigned long seq; + ktime_t xtim, mono, boot; + struct timespec xts, tom, slp; - do { - seq = read_seqbegin(&xtime_lock); - xts = __current_kernel_time(); - tom = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; } /* @@ -184,10 +199,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = base->index; again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[base->index]; + new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* @@ -334,6 +350,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +414,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, @@ -602,67 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm; - unsigned long seq; - - if (!hrtimer_hres_active()) - return; - - do { - seq = read_seqbegin(&xtime_lock); - wtm = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - base = &__get_cpu_var(hrtimer_bases); - - /* Adjust CLOCK_REALTIME offset */ - raw_spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = - timespec_to_ktime(realtime_offset); - - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. - */ -void clock_was_set(void) -{ - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): - */ -void hres_timers_resume(void) -{ - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hres_timers_resume() called with IRQs enabled!"); - - retrigger_next_event(NULL); -} - /* * Initialize the high resolution related parts of cpu_base */ @@ -673,14 +634,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) } /* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - - -/* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry * check happens. The timer gets enqueued into the rbtree. The reprogramming @@ -705,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, } /* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset, xtim, wtm, sleep; + + if (!hrtimer_hres_active()) + return; + + /* Optimized out for !HIGH_RES */ + get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); + + /* Adjust CLOCK_REALTIME offset */ + raw_spin_lock(&base->lock); + base->clock_base[HRTIMER_BASE_REALTIME].offset = + timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); + + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* * Switch to high resolution mode */ static int hrtimer_switch_to_hres(void) { - int cpu = smp_processor_id(); + int i, cpu = smp_processor_id(); struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; @@ -725,8 +706,8 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -750,10 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +#endif + timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt (on the local CPU): + */ +void hrtimers_resume(void) +{ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + + retrigger_next_event(NULL); + timerfd_clock_was_set(); +} + static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { #ifdef CONFIG_TIMER_STATS @@ -846,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, debug_activate(timer); timerqueue_add(&base->active, &timer->node); + base->cpu_base->active_bases |= 1 << base->index; /* * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the @@ -887,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer, #endif } timerqueue_del(&base->active, &timer->node); + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); out: timer->state = newstate; } @@ -1121,6 +1138,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; + int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1129,8 +1147,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &cpu_base->clock_base[clock_id]; - hrtimer_init_timer_hres(timer); + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS @@ -1165,9 +1183,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; } @@ -1222,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base; ktime_t expires_next, now, entry_time, delta; int i, retries = 0; @@ -1244,12 +1262,15 @@ retry: */ cpu_base->expires_next.tv64 = KTIME_MAX; - base = cpu_base->clock_base; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - ktime_t basenow; + struct hrtimer_clock_base *base; struct timerqueue_node *node; + ktime_t basenow; + + if (!(cpu_base->active_bases & (1 << i))) + continue; + base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1282,7 +1303,6 @@ retry: __run_hrtimer(timer, &basenow); } - base++; } /* @@ -1513,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct timespec __user *rmtp; int ret = 0; - hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, + hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); @@ -1565,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, restart = ¤t_thread_info()->restart_block; restart->fn = hrtimer_nanosleep_restart; - restart->nanosleep.index = t.timer.base->index; + restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 53ead174da2f..ea640120ab86 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Zero means infinite timeout - no checking done: */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; unsigned long __read_mostly sysctl_hung_task_warnings = 10; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec7686d..d1d051b38e0b 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,5 +1,6 @@ +# Select this to activate the generic irq options below config HAVE_GENERIC_HARDIRQS - def_bool n + bool if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" @@ -9,28 +10,51 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -# Select this to disable the deprecated stuff -config GENERIC_HARDIRQS_NO_DEPRECATED - def_bool n - # Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available config HAVE_SPARSE_IRQ - def_bool n + bool +# Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE - def_bool n + bool + +# Use the generic /proc/interrupts implementation +config GENERIC_IRQ_SHOW + bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + +# Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ - def_bool n + bool +# Alpha specific irq affinity mechanism config AUTO_IRQ_AFFINITY - def_bool n - -config IRQ_PER_CPU - def_bool n + bool +# Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND - def_bool n + bool + +# Preflow handler support for fasteoi (sparc64) +config IRQ_PREFLOW_FASTEOI + bool + +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + +# Generic configurable interrupt chip implementation +config GENERIC_IRQ_CHIP + bool + +# Support forced irq threading +config IRQ_FORCED_THREADING + bool config SPARSE_IRQ bool "Support sparse irq numbering" diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 54329cd7b3ee..73290056cfb6 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,6 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c36..342d8f44e401 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -17,7 +17,7 @@ /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. + * "IRQS_WAITING" cleared and the interrupt disabled. */ static DEFINE_MUTEX(probing_active); @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) { struct irq_desc *desc; unsigned long mask = 0; - unsigned int status; int i; /* @@ -46,13 +45,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - + if (!desc->action && irq_settings_can_probe(desc)) { /* * Some chips need to know about probing in * progress: @@ -60,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -75,10 +68,10 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->irq_startup(&desc->irq_data)) - desc->status |= IRQ_PENDING; + if (!desc->action && irq_settings_can_probe(desc)) { + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; + if (irq_startup(desc)) + desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); } @@ -93,13 +86,12 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { + if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (!(desc->istate & IRQS_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } else if (i < 32) mask |= 1 << i; @@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int status, mask = 0; + unsigned int mask = 0; struct irq_desc *desc; int i; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) + if (desc->istate & IRQS_AUTODETECT) { + if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; - unsigned int status; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; } - desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad83..d5a3009da71a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -19,140 +19,115 @@ #include "internals.h" /** - * set_irq_chip - set the irq chip for an irq + * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } if (!chip) chip = &no_irq_chip; - raw_spin_lock_irqsave(&desc->lock, flags); - irq_chip_set_defaults(chip); desc->irq_data.chip = chip; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. + */ + irq_reserve_irq(irq); return 0; } -EXPORT_SYMBOL(set_irq_chip); +EXPORT_SYMBOL(irq_set_chip); /** - * set_irq_type - set the irq trigger type for an irq + * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ -int set_irq_type(unsigned int irq, unsigned int type) +int irq_set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - int ret = -ENXIO; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; - if (!desc) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + if (!desc) + return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, type); - raw_spin_unlock_irqrestore(&desc->lock, flags); + if (type != IRQ_TYPE_NONE) + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); return ret; } -EXPORT_SYMBOL(set_irq_type); +EXPORT_SYMBOL(irq_set_irq_type); /** - * set_irq_data - set irq type data for an irq + * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ -int set_irq_data(unsigned int irq, void *data) +int irq_set_handler_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.handler_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_data); +EXPORT_SYMBOL(irq_set_handler_data); /** - * set_irq_msi - set MSI descriptor data for an irq + * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } /** - * set_irq_chip_data - set irq chip data for an irq + * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ -int set_irq_chip_data(unsigned int irq, void *data) +int irq_set_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install chip data for IRQ%d\n", irq); - return -EINVAL; - } - - if (!desc->irq_data.chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.chip_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } -EXPORT_SYMBOL(set_irq_chip_data); +EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { @@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); -/** - * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq - * - * @irq: Interrupt number - * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag - * - * The IRQ_NESTED_THREAD flag indicates that on - * request_threaded_irq() no separate interrupt thread should be - * created for the irq as the handler are called nested in the - * context of a demultiplexing interrupt handler thread. - */ -void set_irq_nested_thread(unsigned int irq, int nest) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (nest) - desc->status |= IRQ_NESTED_THREAD; - else - desc->status &= ~IRQ_NESTED_THREAD; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} -EXPORT_SYMBOL_GPL(set_irq_nested_thread); - -/* - * default enable function - */ -static void default_enable(struct irq_data *data) +static void irq_state_clr_disabled(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -/* - * default disable function - */ -static void default_disable(struct irq_data *data) -{ -} - -/* - * default startup function - */ -static unsigned int default_startup(struct irq_data *data) +static void irq_state_set_disabled(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_enable(data); - return 0; + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); } -/* - * default shutdown function - */ -static void default_shutdown(struct irq_data *data) +static void irq_state_clr_masked(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -/* Temporary migration helpers */ -static void compat_irq_mask(struct irq_data *data) +static void irq_state_set_masked(struct irq_desc *desc) { - data->chip->mask(data->irq); + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } -static void compat_irq_unmask(struct irq_data *data) +int irq_startup(struct irq_desc *desc) { - data->chip->unmask(data->irq); -} + irq_state_clr_disabled(desc); + desc->depth = 0; -static void compat_irq_ack(struct irq_data *data) -{ - data->chip->ack(data->irq); -} - -static void compat_irq_mask_ack(struct irq_data *data) -{ - data->chip->mask_ack(data->irq); -} - -static void compat_irq_eoi(struct irq_data *data) -{ - data->chip->eoi(data->irq); -} - -static void compat_irq_enable(struct irq_data *data) -{ - data->chip->enable(data->irq); -} - -static void compat_irq_disable(struct irq_data *data) -{ - data->chip->disable(data->irq); -} - -static void compat_irq_shutdown(struct irq_data *data) -{ - data->chip->shutdown(data->irq); -} - -static unsigned int compat_irq_startup(struct irq_data *data) -{ - return data->chip->startup(data->irq); -} - -static int compat_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return data->chip->set_affinity(data->irq, dest); -} - -static int compat_irq_set_type(struct irq_data *data, unsigned int type) -{ - return data->chip->set_type(data->irq, type); -} - -static int compat_irq_set_wake(struct irq_data *data, unsigned int on) -{ - return data->chip->set_wake(data->irq, on); -} + if (desc->irq_data.chip->irq_startup) { + int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_masked(desc); + return ret; + } -static int compat_irq_retrigger(struct irq_data *data) -{ - return data->chip->retrigger(data->irq); + irq_enable(desc); + return 0; } -static void compat_bus_lock(struct irq_data *data) +void irq_shutdown(struct irq_desc *desc) { - data->chip->bus_lock(data->irq); + irq_state_set_disabled(desc); + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); } -static void compat_bus_sync_unlock(struct irq_data *data) +void irq_enable(struct irq_desc *desc) { - data->chip->bus_sync_unlock(data->irq); + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); } -#endif -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) +void irq_disable(struct irq_desc *desc) { -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - /* - * Compat fixup functions need to be before we set the - * defaults for enable/disable/startup/shutdown - */ - if (chip->enable) - chip->irq_enable = compat_irq_enable; - if (chip->disable) - chip->irq_disable = compat_irq_disable; - if (chip->shutdown) - chip->irq_shutdown = compat_irq_shutdown; - if (chip->startup) - chip->irq_startup = compat_irq_startup; -#endif - /* - * The real defaults - */ - if (!chip->irq_enable) - chip->irq_enable = default_enable; - if (!chip->irq_disable) - chip->irq_disable = default_disable; - if (!chip->irq_startup) - chip->irq_startup = default_startup; - /* - * We use chip->irq_disable, when the user provided its own. When - * we have default_disable set for chip->irq_disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->irq_shutdown) - chip->irq_shutdown = chip->irq_disable != default_disable ? - chip->irq_disable : default_shutdown; - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - if (!chip->end) - chip->end = dummy_irq_chip.end; - - /* - * Now fix up the remaining compat handlers - */ - if (chip->bus_lock) - chip->irq_bus_lock = compat_bus_lock; - if (chip->bus_sync_unlock) - chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) - chip->irq_mask = compat_irq_mask; - if (chip->unmask) - chip->irq_unmask = compat_irq_unmask; - if (chip->ack) - chip->irq_ack = compat_irq_ack; - if (chip->mask_ack) - chip->irq_mask_ack = compat_irq_mask_ack; - if (chip->eoi) - chip->irq_eoi = compat_irq_eoi; - if (chip->set_affinity) - chip->irq_set_affinity = compat_irq_set_affinity; - if (chip->set_type) - chip->irq_set_type = compat_irq_set_type; - if (chip->set_wake) - chip->irq_set_wake = compat_irq_set_wake; - if (chip->retrigger) - chip->irq_retrigger = compat_irq_retrigger; -#endif + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } } static inline void mask_ack_irq(struct irq_desc *desc) @@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc) if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } -static inline void mask_irq(struct irq_desc *desc) +void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } } -static inline void unmask_irq(struct irq_desc *desc) +void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } } @@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); out_unlock: raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); +static bool irq_check_poll(struct irq_desc *desc) +{ + if (!(desc->istate & IRQS_POLL_INPROGRESS)) + return false; + return irq_wait_for_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. * @irq: the interrupt number @@ -461,32 +293,24 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); + handle_irq_event(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_simple_irq); /** * handle_level_irq - Level type irq handler @@ -501,42 +325,42 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available * keep it masked and get out of here */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); - if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -550,42 +374,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq); void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) + if (!irq_check_poll(desc)) + goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available * then mask it and get out of here: */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - desc->status |= IRQ_PENDING; + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + preflow_handler(desc); + handle_irq_event(desc); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; -out: +out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); - +out_unlock: raw_spin_unlock(&desc->lock); + return; +out: + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; } /** @@ -594,7 +416,7 @@ out: * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurence is latched into the irq controller hardware + * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. If this happens it @@ -609,32 +431,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If we're currently running this IRQ, or its disabled, * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc); - goto out_unlock; + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { + if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } @@ -644,26 +461,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc); + if (unlikely(desc->istate & IRQS_PENDING)) { + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); } - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - raw_spin_lock(&desc->lock); + handle_irq_event(desc); - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar as the above handle_edge_irq, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_eoi: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + /** * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number @@ -674,103 +531,147 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - irqreturn_t action_ret; + struct irq_chip *chip = irq_desc_get_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event_percpu(desc, desc->action); - if (desc->irq_data.chip->irq_eoi) - desc->irq_data.chip->irq_eoi(&desc->irq_data); + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); } void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); + if (!desc) return; - } - if (!handle) + if (!handle) { handle = handle_bad_irq; - else if (desc->irq_data.chip == &no_irq_chip) { - printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->irq_data.chip = &dummy_irq_chip; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + goto out; } - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; } desc->handle_irq = handle; desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); + irq_settings_set_nothread(desc); + irq_startup(desc); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); -} -EXPORT_SYMBOL_GPL(__set_irq_handler); - -void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); +out: + irq_put_desc_busunlock(desc, flags); } +EXPORT_SYMBOL_GPL(__irq_set_handler); void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { - set_irq_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); + irq_set_chip(irq, chip); + __irq_set_handler(irq, handle, 0, name); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return; + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); + + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + + irq_put_desc_unlock(desc, flags); +} +EXPORT_SYMBOL_GPL(irq_modify_status); + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. + */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; - /* Sanitize flags */ - set &= IRQF_MODIFY_MASK; - clr &= IRQF_MODIFY_MASK; + raw_spin_lock_irqsave(&desc->lock, flags); - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~clr; - desc->status |= set; - raw_spin_unlock_irqrestore(&desc->lock, flags); + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. + */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } } diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..97a8bfadc88a --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,45 @@ +/* + * Debugging printout: + */ + +#include <linux/kallsyms.h> + +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOTHREAD); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_PENDING); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); +} + +#undef P +#undef PS +#undef PD diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc5474947e..b5fcd96c7102 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - /* * Generic no controller implementation */ @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - END_INIT }; /* @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - END_INIT }; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c new file mode 100644 index 000000000000..31a9db711906 --- /dev/null +++ b/kernel/irq/generic-chip.c @@ -0,0 +1,354 @@ +/* + * Library implementing the most common irq chip callback functions + * + * Copyright (C) 2011, Thomas Gleixner + */ +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/syscore_ops.h> + +#include "internals.h" + +static LIST_HEAD(gc_list); +static DEFINE_RAW_SPINLOCK(gc_lock); + +static inline struct irq_chip_regs *cur_regs(struct irq_data *d) +{ + return &container_of(d->chip, struct irq_chip_type, chip)->regs; +} + +/** + * irq_gc_noop - NOOP function + * @d: irq_data + */ +void irq_gc_noop(struct irq_data *d) +{ +} + +/** + * irq_gc_mask_disable_reg - Mask chip via disable register + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register. + */ +void irq_gc_mask_disable_reg(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); + gc->mask_cache &= ~mask; + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register + * @d: irq_data + * + * Chip has a single mask register. Values of this register are cached + * and protected by gc->lock + */ +void irq_gc_mask_set_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + gc->mask_cache |= mask; + irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register + * @d: irq_data + * + * Chip has a single mask register. Values of this register are cached + * and protected by gc->lock + */ +void irq_gc_mask_clr_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + gc->mask_cache &= ~mask; + irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_gc_unlock(gc); +} + +/** + * irq_gc_unmask_enable_reg - Unmask chip via enable register + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register. + */ +void irq_gc_unmask_enable_reg(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); + gc->mask_cache |= mask; + irq_gc_unlock(gc); +} + +/** + * irq_gc_ack - Ack pending interrupt + * @d: irq_data + */ +void irq_gc_ack(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt + * @d: irq_data + */ +void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_eoi - EOI interrupt + * @d: irq_data + */ +void irq_gc_eoi(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_gc_unlock(gc); +} + +/** + * irq_gc_set_wake - Set/clr wake bit for an interrupt + * @d: irq_data + * + * For chips where the wake from suspend functionality is not + * configured in a separate register and the wakeup active state is + * just stored in a bitmask. + */ +int irq_gc_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + if (!(mask & gc->wake_enabled)) + return -EINVAL; + + irq_gc_lock(gc); + if (on) + gc->wake_active |= mask; + else + gc->wake_active &= ~mask; + irq_gc_unlock(gc); + return 0; +} + +/** + * irq_alloc_generic_chip - Allocate a generic chip and initialize it + * @name: Name of the irq chip + * @num_ct: Number of irq_chip_type instances associated with this + * @irq_base: Interrupt base nr for this chip + * @reg_base: Register base address (virtual) + * @handler: Default flow handler associated with this chip + * + * Returns an initialized irq_chip_generic structure. The chip defaults + * to the primary (index 0) irq_chip_type and @handler + */ +struct irq_chip_generic * +irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + struct irq_chip_generic *gc; + unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + + gc = kzalloc(sz, GFP_KERNEL); + if (gc) { + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; + } + return gc; +} + +/* + * Separate lockdep class for interrupt chip which can nest irq_desc + * lock. + */ +static struct lock_class_key irq_nested_lock_class; + +/** + * irq_setup_generic_chip - Setup a range of interrupts with a generic chip + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @flags: Flags for initialization + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Set up max. 32 interrupts starting from gc->irq_base. Note, this + * initializes all interrupts to the primary irq_chip_type and its + * associated handler. + */ +void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, + enum irq_gc_flags flags, unsigned int clr, + unsigned int set) +{ + struct irq_chip_type *ct = gc->chip_types; + unsigned int i; + + raw_spin_lock(&gc_lock); + list_add_tail(&gc->list, &gc_list); + raw_spin_unlock(&gc_lock); + + /* Init mask cache ? */ + if (flags & IRQ_GC_INIT_MASK_CACHE) + gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + + for (i = gc->irq_base; msk; msk >>= 1, i++) { + if (!msk & 0x01) + continue; + + if (flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(i, &irq_nested_lock_class); + + irq_set_chip_and_handler(i, &ct->chip, ct->handler); + irq_set_chip_data(i, gc); + irq_modify_status(i, clr, set); + } + gc->irq_cnt = i - gc->irq_base; +} + +/** + * irq_setup_alt_chip - Switch to alternative chip + * @d: irq_data for this interrupt + * @type Flow type to be initialized + * + * Only to be called from chip->irq_set_type() callbacks. + */ +int irq_setup_alt_chip(struct irq_data *d, unsigned int type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = gc->chip_types; + unsigned int i; + + for (i = 0; i < gc->num_ct; i++, ct++) { + if (ct->type & type) { + d->chip = &ct->chip; + irq_data_to_desc(d)->handle_irq = ct->handler; + return 0; + } + } + return -EINVAL; +} + +/** + * irq_remove_generic_chip - Remove a chip + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Remove up to 32 interrupts starting from gc->irq_base. + */ +void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, + unsigned int clr, unsigned int set) +{ + unsigned int i = gc->irq_base; + + raw_spin_lock(&gc_lock); + list_del(&gc->list); + raw_spin_unlock(&gc_lock); + + for (; msk; msk >>= 1, i++) { + if (!msk & 0x01) + continue; + + /* Remove handler first. That will mask the irq line */ + irq_set_handler(i, NULL); + irq_set_chip(i, &no_irq_chip); + irq_set_chip_data(i, NULL); + irq_modify_status(i, clr, set); + } +} + +#ifdef CONFIG_PM +static int irq_gc_suspend(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_suspend) + ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + } + return 0; +} + +static void irq_gc_resume(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_resume) + ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + } +} +#else +#define irq_gc_suspend NULL +#define irq_gc_resume NULL +#endif + +static void irq_gc_shutdown(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_pm_shutdown) + ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + } +} + +static struct syscore_ops irq_gc_syscore_ops = { + .suspend = irq_gc_suspend, + .resume = irq_gc_resume, + .shutdown = irq_gc_shutdown, +}; + +static int __init irq_gc_init_ops(void) +{ + register_syscore_ops(&irq_gc_syscore_ops); + return 0; +} +device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a7190122..90cb55f6d7eb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS. + * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; + irqreturn_t retval = IRQ_NONE; + unsigned int random = 0, irq = desc->irq_data.irq; do { + irqreturn_t res; + trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, ret); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); + + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) + local_irq_disable(); - switch (ret) { + switch (res) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. */ - ret = IRQ_HANDLED; + res = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but @@ -85,36 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) break; } - /* - * Wake up the handler thread for this - * action. In case the thread crashed and was - * killed we just pretend that we handled the - * interrupt. The hardirq handler above has - * disabled the device interrupt, so no irq - * storm is lurking. - */ - if (likely(!test_bit(IRQTF_DIED, - &action->thread_flags))) { - set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - wake_up_process(action->thread); - } + irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: - status |= action->flags; + random |= action->flags; break; default: break; } - retval |= ret; + retval |= res; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (random & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - local_irq_disable(); + if (!noirqdebug) + note_interrupt(irq, desc, retval); return retval; } + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + return ret; +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99c3bc8a6fb4..6546431447d7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,5 +1,9 @@ /* * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. */ #include <linux/irqdesc.h> @@ -9,25 +13,75 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#define istate core_internal_state__do_not_mess_with_it + extern int noirqdebug; -#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, +}; -/* Set default functions for irq_chip structures: */ -extern void irq_chip_set_defaults(struct irq_chip *chip); +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection + * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting + * IRQS_PENDING - irq is pending and replayed later + * IRQS_SUSPENDED - irq is suspended + */ +enum { + IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, + IRQS_PENDING = 0x00000200, + IRQS_SUSPENDED = 0x00000800, +}; + +#include "debug.h" +#include "settings.h" -/* Set default handler: */ -extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern int irq_startup(struct irq_desc *desc); +extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -43,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif -extern int irq_select_affinity_usr(unsigned int irq); +extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static inline void irq_end(unsigned int irq, struct irq_desc *desc) -{ - if (desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); -} -#else -static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } -#endif - /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { @@ -70,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + /* - * Debugging printout: + * Manipulation functions for irq_data.state */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; +} -#include <linux/kallsyms.h> - -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; +} -static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +static inline void irqd_clear(struct irq_data *d, unsigned int mask) { - printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", - irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); - printk("->action(): %p\n", desc->action); - if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); - } - - P(IRQ_INPROGRESS); - P(IRQ_DISABLED); - P(IRQ_PENDING); - P(IRQ_REPLAY); - P(IRQ_AUTODETECT); - P(IRQ_WAITING); - P(IRQ_LEVEL); - P(IRQ_MASKED); -#ifdef CONFIG_IRQ_PER_CPU - P(IRQ_PER_CPU); -#endif - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOAUTOEN); + d->state_use_accessors &= ~mask; } -#undef P +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & mask; +} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2039bea31bdf..886e80347b32 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -22,7 +22,7 @@ */ static struct lock_class_key irq_desc_lock_class; -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +#if defined(CONFIG_SMP) static void __init init_irq_default_affinity(void) { alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); @@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = IRQ_DEFAULT_INIT_FLAGS; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -197,13 +198,12 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) +static int irq_expand_nr_irqs(unsigned int nr) { - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; + if (nr > IRQ_BITMAP_BITS) + return -ENOMEM; + nr_irqs = nr; + return 0; } int __init early_irq_init(void) @@ -238,7 +238,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), @@ -260,8 +259,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - /* TODO : do this allocation on-demand ... */ desc[i].kstat_irqs = alloc_percpu(unsigned int); + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -274,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); @@ -286,26 +280,32 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { -#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) - struct irq_desc *desc; - unsigned int i; - - for (i = 0; i < cnt; i++) { - desc = irq_to_desc(start + i); - if (desc && !desc->kstat_irqs) { - unsigned int __percpu *stats = alloc_percpu(unsigned int); - - if (!stats) - return -1; - if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) - free_percpu(stats); - } - } -#endif return start; } + +static int irq_expand_nr_irqs(unsigned int nr) +{ + return -ENOMEM; +} + #endif /* !CONFIG_SPARSE_IRQ */ +/** + * generic_handle_irq - Invoke the handler for a particular irq + * @irq: The irq number to handle + * + */ +int generic_handle_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return -EINVAL; + generic_handle_irq_desc(irq, desc); + return 0; +} +EXPORT_SYMBOL_GPL(generic_handle_irq); + /* Dynamic interrupt handling */ /** @@ -327,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } +EXPORT_SYMBOL_GPL(irq_free_descs); /** * irq_alloc_descs - allocate and initialize a range of irq descriptors @@ -347,14 +348,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) goto err; - ret = -ENOMEM; - if (start >= nr_irqs) - goto err; + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); + if (ret) + goto err; + } bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); @@ -364,6 +368,7 @@ err: mutex_unlock(&sparse_irq_lock); return ret; } +EXPORT_SYMBOL_GPL(irq_alloc_descs); /** * irq_reserve_irqs - mark irqs allocated @@ -401,6 +406,26 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize @@ -423,7 +448,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } -#ifdef CONFIG_GENERIC_HARDIRQS unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -436,4 +460,3 @@ unsigned int kstat_irqs(unsigned int irq) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } -#endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9033c1c70828..f7ce0021e1c4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,17 @@ #include "internals.h" +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -30,7 +41,7 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int status; + bool inprogress; if (!desc) return; @@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - status = desc->status; + inprogress = irqd_irq_inprogress(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (status & IRQ_INPROGRESS); + } while (inprogress); /* * We made sure that no hardirq handler is running. Now verify @@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc) } } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_data *data) +{ + return irqd_can_move_in_process_context(data); +} +static inline bool irq_move_pending(struct irq_data *data) +{ + return irqd_is_setaffinity_pending(data); +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } +#endif + +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) +{ + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); + int ret = 0; + + if (!chip || !chip->irq_set_affinity) + return -EINVAL; + + if (irq_can_move_pcntxt(data)) { + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + } else { + irqd_set_move_pending(data); + irq_copy_pending(desc, mask); + } + + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); + } + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity - * @cpumask: cpumask + * @mask: cpumask * */ -int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; + int ret; - if (!chip->irq_set_affinity) + if (!desc) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); - -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); - irq_set_thread_affinity(desc); - } - } - else { - desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, cpumask); - } -#else - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); - irq_set_thread_affinity(desc); - } -#endif - desc->status |= IRQ_AFFINITY_SET; + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; + return ret; } int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->affinity_hint = m; + irq_put_desc_unlock(desc, flags); + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (irq_move_pending(&desc->irq_data)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_data.affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(¬ify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). + */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; unsigned long flags; + /* The release function is promised process context */ + might_sleep(); + if (!desc) return -EINVAL; + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(¬ify->kref); + INIT_WORK(¬ify->work, irq_affinity_notify); + } + raw_spin_lock_irqsave(&desc->lock, flags); - desc->affinity_hint = m; + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ -static int setup_affinity(unsigned int irq, struct irq_desc *desc) +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { + struct irq_chip *chip = irq_desc_get_chip(desc); + struct cpumask *set = irq_default_affinity; + int ret; + + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) - < nr_cpu_ids) - goto set_affinity; + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (cpumask_intersects(desc->irq_data.affinity, + cpu_online_mask)) + set = desc->irq_data.affinity; else - desc->status &= ~IRQ_AFFINITY_SET; + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } - cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); -set_affinity: - desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); - + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } return 0; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *d) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) { return irq_select_affinity(irq); } @@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) /* * Called when affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq) +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc); - if (!ret) - irq_set_thread_affinity(desc); + ret = setup_affinity(irq, desc, mask); raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (suspend) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; - desc->status |= IRQ_SUSPENDED; + desc->istate |= IRQS_SUSPENDED; } - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_disable(&desc->irq_data); - } + if (!desc->depth++) + irq_disable(desc); +} + +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + __disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; } /** @@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); @@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc) - return; - - disable_irq_nosync(irq); - if (desc->action) + if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) - desc->status &= ~IRQ_SUSPENDED; + if (resume) { + if (!(desc->istate & IRQS_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } + desc->istate &= ~IRQS_SUSPENDED; + } switch (desc->depth) { case 0: @@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; + irq_settings_set_noprobe(desc); + irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ } @@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); if (!desc) return; + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + goto out; - if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); @@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * set_irq_wake - control irq power management wakeup + * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * @@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". */ -int set_irq_wake(unsigned int irq, unsigned int on) +int irq_set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else - desc->status |= IRQ_WAKEUP; + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { @@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->status &= ~IRQ_WAKEUP; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } - - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_busunlock(desc, flags); return ret; } -EXPORT_SYMBOL(set_irq_wake); +EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a @@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; if (!desc) return 0; - if (desc->status & IRQ_NOREQUEST) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return !action; -} - -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; + if (irq_settings_can_request(desc)) { + if (desc->action) + if (irqflags & desc->action->flags & IRQF_SHARED) + canrequest =1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { - int ret; struct irq_chip *chip = desc->irq_data.chip; + int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* @@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } + flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!irqd_irq_masked(&desc->irq_data)) + mask_irq(desc); + if (!irqd_irq_disabled(&desc->irq_data)) + unmask = 1; + } + /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); - if (ret) + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } + + ret = 0; + break; + default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); - else { - if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) - flags |= IRQ_LEVEL; - /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); - desc->status |= flags; - - if (chip != desc->irq_data.chip) - irq_chip_set_defaults(desc->irq_data.chip); } - + if (unmask) + unmask_irq(desc); return ret; } @@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) { + if (!(desc->istate & IRQS_ONESHOT)) + return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); @@ -522,26 +651,42 @@ again: * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due - * to IRQ_INPROGRESS and the irq line is masked forever. + * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. */ - if (unlikely(desc->status & IRQ_INPROGRESS)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } - if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { - desc->status &= ~IRQ_MASKED; - desc->irq_data.chip->irq_unmask(&desc->irq_data); - } + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. + */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + +out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP /* - * Check whether we need to change the affinity of the interrupt thread. + * Check whether we need to chasnge the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) @@ -573,6 +718,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif /* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static void +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + local_bh_disable(); + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. + */ +static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -582,7 +753,14 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + int wake; + + if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -594,23 +772,19 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->status & IRQ_DISABLED)) { + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which * retriggers the thread in check_irq_resend() - * but AFAICT IRQ_PENDING should be fine as it + * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - desc->status |= IRQ_PENDING; + desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); - - action->thread_fn(action->irq, action->dev_id); - - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + handler_fn(desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -619,6 +793,9 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -633,6 +810,7 @@ static int irq_thread(void *data) void exit_irq_thread(void) { struct task_struct *tsk = current; + struct irq_desc *desc; if (!tsk->irqaction) return; @@ -641,6 +819,14 @@ void exit_irq_thread(void) "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. @@ -648,6 +834,22 @@ void exit_irq_thread(void) set_bit(IRQTF_DIED, &tsk->irqaction->flags); } +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -657,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; - int nested, shared = 0; - int ret; + unsigned long flags, thread_mask = 0; + int ret, nested, shared = 0; + cpumask_var_t mask; if (!desc) return -EINVAL; @@ -683,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; - /* * Check whether the interrupt nests into another interrupt * thread. */ - nested = desc->status & IRQ_NESTED_THREAD; + nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) return -EINVAL; @@ -701,6 +899,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; + } else { + if (irq_settings_can_thread(desc)) + irq_setup_forced_threading(new); } /* @@ -724,6 +925,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread = t; } + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + /* * The following block of code has to be executed atomically */ @@ -735,32 +941,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. + * set the trigger type must match. Also all must + * agree on ONESHOT. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { old_name = old->name; goto mismatch; } -#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; -#endif /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } - if (!shared) { - irq_chip_set_defaults(desc->irq_data.chip); + /* + * Setup the thread mask for this irqaction. Unlikely to have + * 32 resp 64 irqs sharing one line, but who knows. + */ + if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -769,42 +984,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags & IRQF_TRIGGER_MASK); if (ret) - goto out_thread; - } else - compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + goto out_mask; + } - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } if (new->flags & IRQF_ONESHOT) - desc->status |= IRQ_ONESHOT; + desc->istate |= IRQS_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->irq_startup(&desc->irq_data); - } else + if (irq_settings_can_autoenable(desc)) + irq_startup(desc); + else /* Undo nested disables: */ desc->depth = 1; /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc); - - } else if ((new->flags & IRQF_TRIGGER_MASK) - && (new->flags & IRQF_TRIGGER_MASK) - != (desc->status & IRQ_TYPE_SENSE_MASK)) { - /* hope the handler works with the actual trigger mode... */ - pr_warning("IRQ %d uses trigger mode %d; requested %d\n", - irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), - (int)(new->flags & IRQF_TRIGGER_MASK)); + setup_affinity(irq, desc, mask); + + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } new->irq = irq; @@ -818,8 +1035,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc, irq, false); } @@ -835,6 +1052,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); + free_cpumask_var(mask); return 0; @@ -849,8 +1067,11 @@ mismatch: #endif ret = -EBUSY; -out_thread: +out_mask: raw_spin_unlock_irqrestore(&desc->lock, flags); + free_cpumask_var(mask); + +out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -871,9 +1092,14 @@ out_thread: */ int setup_irq(unsigned int irq, struct irqaction *act) { + int retval; struct irq_desc *desc = irq_to_desc(irq); - return __setup_irq(irq, desc, act); + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -924,13 +1150,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else - desc->irq_data.chip->irq_disable(&desc->irq_data); - } + if (!desc->action) + irq_shutdown(desc); #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ @@ -1004,6 +1225,11 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); chip_bus_sync_unlock(desc); @@ -1074,7 +1300,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return -EINVAL; if (!handler) { @@ -1149,7 +1375,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NESTED_THREAD) { + if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd629ff04..47420908fba0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,23 +4,23 @@ #include "internals.h" -void move_masked_irq(int irq) +void irq_move_masked_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (CHECK_IRQ_PER_CPU(desc->status)) { + if (!irqd_can_balance(&desc->irq_data)) { WARN_ON(1); return; } - desc->status &= ~IRQ_MOVE_PENDING; + irqd_clr_move_pending(&desc->irq_data); if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -35,7 +35,7 @@ void move_masked_irq(int irq) * do the disable, re-program, enable sequence. * This is *not* particularly important for level triggered * but in a edge trigger case, we might be setting rte - * when an active trigger is comming in. This could + * when an active trigger is coming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! * @@ -53,15 +53,14 @@ void move_masked_irq(int irq) cpumask_clear(desc->pending_mask); } -void move_native_irq(int irq) +void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); bool masked; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->status & IRQ_DISABLED)) + if (unlikely(irqd_irq_disabled(idata))) return; /* @@ -69,10 +68,10 @@ void move_native_irq(int irq) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->status & IRQ_MASKED; + masked = irqd_irq_masked(idata); if (!masked) - desc->irq_data.chip->irq_mask(&desc->irq_data); - move_masked_irq(irq); + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); if (!masked) - desc->irq_data.chip->irq_unmask(&desc->irq_data); + idata->chip->irq_unmask(idata); } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -18,7 +18,7 @@ * During system-wide suspend or hibernation device drivers need to be prevented * from receiving interrupts and this function is provided for this purpose. * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQ_SUSPENDED flag for each of them. + * and sets the IRQS_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { @@ -34,7 +34,7 @@ void suspend_device_irqs(void) } for_each_irq_desc(irq, desc) - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) synchronize_irq(irq); } EXPORT_SYMBOL_GPL(suspend_device_irqs); @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQ_SUSPENDED flag set. + * have the IRQS_SUSPENDED flag set. */ void resume_device_irqs(void) { @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -71,9 +68,24 @@ int check_wakeup_irqs(void) struct irq_desc *desc; int irq; - for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) - return -EBUSY; + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) + return -EBUSY; + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } return 0; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7b..4bd4faa6323a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/kernel_stat.h> #include "internals.h" @@ -18,16 +19,19 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int irq_affinity_proc_show(struct seq_file *m, void *v) +static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) + if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif - seq_cpumask(m, mask); + if (type) + seq_cpumask_list(m, mask); + else + seq_cpumask(m, mask); seq_putc(m, '\n'); return 0; } @@ -58,21 +62,34 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) #endif int no_irq_affinity; -static ssize_t irq_affinity_proc_write(struct file *file, +static int irq_affinity_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(0, m, v); +} + +static int irq_affinity_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(1, m, v); +} + + +static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) + if (!irq_can_set_affinity(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; - err = cpumask_parse_user(buffer, count, new_value); + if (type) + err = cpumask_parselist_user(buffer, count, new_value); + else + err = cpumask_parse_user(buffer, count, new_value); if (err) goto free_cpumask; @@ -89,7 +106,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq) ? -EINVAL : count; + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; @@ -100,11 +117,28 @@ free_cpumask: return err; } +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(0, file, buffer, count, pos); +} + +static ssize_t irq_affinity_list_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(1, file, buffer, count, pos); +} + static int irq_affinity_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); +} + static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); @@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = { .release = single_release, }; +static const struct file_operations irq_affinity_list_proc_fops = { + .open = irq_affinity_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_list_proc_write, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("affinity_hint", 0400, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); + /* create /proc/irq/<irq>/smp_affinity_list */ + proc_create_data("smp_affinity_list", 0600, desc->dir, + &irq_affinity_list_proc_fops, (void *)(long)irq); + proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, (void *)(long)irq); #endif @@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP remove_proc_entry("smp_affinity", desc->dir); remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); #endif remove_proc_entry("spurious", desc->dir); @@ -357,3 +404,83 @@ void init_irq_proc(void) } } +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > ACTUAL_NR_IRQS) + return 0; + + if (i == ACTUAL_NR_IRQS) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); +#endif + if (desc->name) + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dc49358b73fa..14dd5761e8c9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - unsigned int status = desc->status; - - /* - * Make sure the interrupt is enabled, before resending it: - */ - desc->irq_data.chip->irq_enable(&desc->irq_data); - /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still * active. */ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; + if (irq_settings_is_level(desc)) + return; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->istate & IRQS_PENDING) { + desc->istate &= ~IRQS_PENDING; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..f1667833d444 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,142 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOTHREAD = IRQ_NOTHREAD, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, +}; + +#define IRQ_PER_CPU GOT_YOU_MORON +#define IRQ_NO_BALANCING GOT_YOU_MORON +#define IRQ_LEVEL GOT_YOU_MORON +#define IRQ_NOPROBE GOT_YOU_MORON +#define IRQ_NOREQUEST GOT_YOU_MORON +#define IRQ_NOTHREAD GOT_YOU_MORON +#define IRQ_NOAUTOEN GOT_YOU_MORON +#define IRQ_NESTED_THREAD GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_PER_CPU; +} + +static inline void irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NO_BALANCING; +} + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_LEVEL; +} + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_thread(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOTHREAD); +} + +static inline void irq_settings_clr_nothread(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOTHREAD; +} + +static inline void irq_settings_set_nothread(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOTHREAD; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOPROBE; +} + +static inline void irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NESTED_THREAD; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f3..dfbd550401b2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -21,70 +21,93 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static int irq_poll_cpu; +static atomic_t irq_poll_active; + +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. + */ +bool irq_wait_for_poll(struct irq_desc *desc) +{ + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + +#ifdef CONFIG_SMP + do { + raw_spin_unlock(&desc->lock); + while (irqd_irq_inprogress(&desc->irq_data)) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (irqd_irq_inprogress(&desc->irq_data)); + /* Might have been disabled in meantime */ + return !irqd_irq_disabled(&desc->irq_data) && desc->action; +#else + return false; +#endif +} + /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc) +static int try_one_irq(int irq, struct irq_desc *desc, bool force) { + irqreturn_t ret = IRQ_NONE; struct irqaction *action; - int ok = 0, work = 0; raw_spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - raw_spin_unlock(&desc->lock); - return ok; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; - raw_spin_unlock(&desc->lock); - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(irq, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - raw_spin_lock(&desc->lock); - action = desc->action; + /* PER_CPU and nested thread interrupts are never polled */ + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + goto out; /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: + * Do not poll disabled interrupts unless the spurious + * disabled poller asks explicitely. */ - while ((desc->status & IRQ_PENDING) && action) { + if (irqd_irq_disabled(&desc->irq_data) && !force) + goto out; + + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. + */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || + (action->flags & __IRQF_TIMER) || !action->next) + goto out; + + /* Already running on another processor */ + if (irqd_irq_inprogress(&desc->irq_data)) { /* - * Perform real IRQ processing for the IRQ we deferred + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too */ - work = 1; - raw_spin_unlock(&desc->lock); - handle_IRQ_event(irq, action); - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; + desc->istate |= IRQS_PENDING; + goto out; } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too - */ - if (work) - irq_end(irq, desc); - raw_spin_unlock(&desc->lock); - return ok; + /* Mark it poll in progress */ + desc->istate |= IRQS_POLL_INPROGRESS; + do { + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; + action = desc->action; + } while ((desc->istate & IRQS_PENDING) && action); + desc->istate &= ~IRQS_POLL_INPROGRESS; +out: + raw_spin_unlock(&desc->lock); + return ret == IRQ_HANDLED; } static int misrouted_irq(int irq) @@ -92,6 +115,11 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; + if (atomic_inc_return(&irq_poll_active) == 1) + goto out; + + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { if (!i) continue; @@ -99,9 +127,11 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc)) + if (try_one_irq(i, desc, false)) ok = 1; } +out: + atomic_dec(&irq_poll_active); /* So the caller can adjust the irq error counts */ return ok; } @@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy) struct irq_desc *desc; int i; + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { - unsigned int status; + unsigned int state; if (!i) continue; /* Racy but it doesn't matter */ - status = desc->status; + state = desc->istate; barrier(); - if (!(status & IRQ_SPURIOUS_DISABLED)) + if (!(state & IRQS_SPURIOUS_DISABLED)) continue; local_irq_disable(); - try_one_irq(i, desc); + try_one_irq(i, desc, true); local_irq_enable(); } - +out: + atomic_dec(&irq_poll_active); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } @@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy) * * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock */ - static void __report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { struct irqaction *action; + unsigned long flags; if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { printk(KERN_ERR "irq event %d: bogus return value %x\n", @@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, dump_stack(); printk(KERN_ERR "handlers:\n"); + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, printk("\n"); action = action->next; } + raw_spin_unlock_irqrestore(&desc->lock, flags); } static void @@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { + if (desc->istate & IRQS_POLL_INPROGRESS) + return; + if (unlikely(action_ret != IRQ_HANDLED)) { /* * If we are seeing only the odd spurious IRQ caused by @@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd938330..fa27e750dbc0 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,43 +2,23 @@ * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> * */ -#include <linux/jump_label.h> #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> -#include <linux/jhash.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> +#include <linux/jump_label.h> #ifdef HAVE_JUMP_LABEL -#define JUMP_LABEL_HASH_BITS 6 -#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) -static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; - /* mutex to protect coming/going of the the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); -struct jump_label_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - /* hang modules off here */ - struct hlist_head modules; - unsigned long key; -}; - -struct jump_label_module_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - struct module *mod; -}; - void jump_label_lock(void) { mutex_lock(&jump_label_mutex); @@ -49,6 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } +bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) } static void -sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; @@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static struct jump_label_entry *get_jump_label_entry(jump_label_t key) -{ - struct hlist_head *head; - struct hlist_node *node; - struct jump_label_entry *e; - u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); - - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (key == e->key) - return e; - } - return NULL; -} +static void jump_label_update(struct jump_label_key *key, int enable); -static struct jump_label_entry * -add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +void jump_label_inc(struct jump_label_key *key) { - struct hlist_head *head; - struct jump_label_entry *e; - u32 hash; - - e = get_jump_label_entry(key); - if (e) - return ERR_PTR(-EEXIST); - - e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - - hash = jhash((void *)&key, sizeof(jump_label_t), 0); - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - e->key = key; - e->table = table; - e->nr_entries = nr_entries; - INIT_HLIST_HEAD(&(e->modules)); - hlist_add_head(&e->hlist, head); - return e; -} + if (atomic_inc_not_zero(&key->enabled)) + return; -static int -build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) -{ - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - int count; - - sort_jump_label_entries(start, stop); - iter = start; - while (iter < stop) { - entry = get_jump_label_entry(iter->key); - if (!entry) { - iter_begin = iter; - count = 0; - while ((iter < stop) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - entry = add_jump_label_entry(iter_begin->key, - count, iter_begin); - if (IS_ERR(entry)) - return PTR_ERR(entry); - } else { - WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); - return -1; - } - } - return 0; + jump_label_lock(); + if (atomic_add_return(1, &key->enabled) == 1) + jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_unlock(); } -/*** - * jump_label_update - update jump label text - * @key - key value associated with a a jump label - * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE - * - * Will enable/disable the jump for jump label @key, depending on the - * value of @type. - * - */ - -void jump_label_update(unsigned long key, enum jump_label_type type) +void jump_label_dec(struct jump_label_key *key) { - struct jump_entry *iter; - struct jump_label_entry *entry; - struct hlist_node *module_node; - struct jump_label_module_entry *e_module; - int count; + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + return; - jump_label_lock(); - entry = get_jump_label_entry((jump_label_t)key); - if (entry) { - count = entry->nr_entries; - iter = entry->table; - while (count--) { - if (kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - /* eanble/disable jump labels in modules */ - hlist_for_each_entry(e_module, module_node, &(entry->modules), - hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (iter->key && - kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - } - } + jump_label_update(key, JUMP_LABEL_DISABLE); jump_label_unlock(); } @@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) return 0; } -#ifdef CONFIG_MODULES - -static int module_conflict(void *start, void *end) -{ - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; - struct jump_entry *iter; - int i, count; - int conflict = 0; - - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } - iter++; - } - } - } - } -out: - return conflict; -} - -#endif - -/*** - * jump_label_text_reserved - check if addr range is reserved - * @start: start text addr - * @end: end text addr - * - * checks if the text addr located between @start and @end - * overlaps with any of the jump label patch addresses. Code - * that wants to modify kernel text should first verify that - * it does not overlap with any of the jump label addresses. - * Caller must hold jump_label_mutex. - * - * returns 1 if there is an overlap, 0 otherwise - */ -int jump_label_text_reserved(void *start, void *end) +static int __jump_label_text_reserved(struct jump_entry *iter_start, + struct jump_entry *iter_stop, void *start, void *end) { struct jump_entry *iter; - struct jump_entry *iter_start = __start___jump_table; - struct jump_entry *iter_stop = __start___jump_table; - int conflict = 0; iter = iter_start; while (iter < iter_stop) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } + if (addr_conflict(iter, start, end)) + return 1; iter++; } - /* now check modules */ -#ifdef CONFIG_MODULES - conflict = module_conflict(start, end); -#endif -out: - return conflict; + return 0; +} + +static void __jump_label_update(struct jump_label_key *key, + struct jump_entry *entry, + struct jump_entry *stop, int enable) +{ + for (; (entry < stop) && + (entry->key == (jump_label_t)(unsigned long)key); + entry++) { + /* + * entry->code set to 0 invalidates module init text sections + * kernel_text_address() verifies we are not in core kernel + * init code, see jump_label_invalidate_module_init(). + */ + if (entry->code && kernel_text_address(entry->code)) + arch_jump_label_transform(entry, enable); + } } /* @@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) { } -static __init int init_jump_label(void) +static __init int jump_label_init(void) { - int ret; struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; + struct jump_label_key *key = NULL; struct jump_entry *iter; jump_label_lock(); - ret = build_jump_label_hashtable(__start___jump_table, - __stop___jump_table); - iter = iter_start; - while (iter < iter_stop) { + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { arch_jump_label_text_poke_early(iter->code); - iter++; + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + atomic_set(&key->enabled, 0); + key->entries = iter; +#ifdef CONFIG_MODULES + key->next = NULL; +#endif } jump_label_unlock(); - return ret; + + return 0; } -early_initcall(init_jump_label); +early_initcall(jump_label_init); #ifdef CONFIG_MODULES -static struct jump_label_module_entry * -add_jump_label_module_entry(struct jump_label_entry *entry, - struct jump_entry *iter_begin, - int count, struct module *mod) +struct jump_label_mod { + struct jump_label_mod *next; + struct jump_entry *entries; + struct module *mod; +}; + +static int __jump_label_mod_text_reserved(void *start, void *end) { - struct jump_label_module_entry *e; - - e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - e->mod = mod; - e->nr_entries = count; - e->table = iter_begin; - hlist_add_head(&e->hlist, &entry->modules); - return e; + struct module *mod; + + mod = __module_text_address((unsigned long)start); + if (!mod) + return 0; + + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + + return __jump_label_text_reserved(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries, + start, end); } -static int add_jump_label_module(struct module *mod) +static void __jump_label_mod_update(struct jump_label_key *key, int enable) { - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - struct jump_label_module_entry *module_entry; - int count; + struct jump_label_mod *mod = key->next; - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return 0; + while (mod) { + struct module *m = mod->mod; - sort_jump_label_entries(mod->jump_entries, - mod->jump_entries + mod->num_jump_entries); - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - entry = get_jump_label_entry(iter->key); - iter_begin = iter; - count = 0; - while ((iter < mod->jump_entries + mod->num_jump_entries) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - if (!entry) { - entry = add_jump_label_entry(iter_begin->key, 0, NULL); - if (IS_ERR(entry)) - return PTR_ERR(entry); - } - module_entry = add_jump_label_module_entry(entry, iter_begin, - count, mod); - if (IS_ERR(module_entry)) - return PTR_ERR(module_entry); + __jump_label_update(key, mod->entries, + m->jump_entries + m->num_jump_entries, + enable); + mod = mod->next; } - return 0; } -static void remove_jump_label_module(struct module *mod) +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; - int i; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) + if (iter_start == iter_stop) return; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod == mod) { - hlist_del(&e_module->hlist); - kfree(e_module); - } - } - if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { - hlist_del(&e->hlist); - kfree(e); - } + for (iter = iter_start; iter < iter_stop; iter++) + arch_jump_label_text_poke_early(iter->code); +} + +static int jump_label_add_module(struct module *mod) +{ + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm; + + /* if the module doesn't have jump label entries, just return */ + if (iter_start == iter_stop) + return 0; + + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) { + atomic_set(&key->enabled, 0); + key->entries = iter; + key->next = NULL; + continue; } + + jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + if (!jlm) + return -ENOMEM; + + jlm->mod = mod; + jlm->entries = iter; + jlm->next = key->next; + key->next = jlm; + + if (jump_label_enabled(key)) + __jump_label_update(key, iter, iter_stop, + JUMP_LABEL_ENABLE); } + + return 0; } -static void remove_jump_label_module_init(struct module *mod) +static void jump_label_del_module(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - int i, count; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm, **prev; - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) + continue; + + prev = &key->next; + jlm = key->next; + + while (jlm && jlm->mod != mod) { + prev = &jlm->next; + jlm = jlm->next; + } - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod != mod) - continue; - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (within_module_init(iter->code, mod)) - iter->key = 0; - iter++; - } - } + if (jlm) { + *prev = jlm->next; + kfree(jlm); } } } +static void jump_label_invalidate_module_init(struct module *mod) +{ + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + + for (iter = iter_start; iter < iter_stop; iter++) { + if (within_module_init(iter->code, mod)) + iter->code = 0; + } +} + static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -426,59 +313,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: jump_label_lock(); - ret = add_jump_label_module(mod); + ret = jump_label_add_module(mod); if (ret) - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_GOING: jump_label_lock(); - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_LIVE: jump_label_lock(); - remove_jump_label_module_init(mod); + jump_label_invalidate_module_init(mod); jump_label_unlock(); break; } - return ret; -} -/*** - * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() - * @mod: module to patch - * - * Allow for run-time selection of the optimal nops. Before the module - * loads patch these with arch_get_jump_label_nop(), which is specified by - * the arch specific jump label code. - */ -void jump_label_apply_nops(struct module *mod) -{ - struct jump_entry *iter; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - arch_jump_label_text_poke_early(iter->code); - iter++; - } + return notifier_from_errno(ret); } struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, - .priority = 0, + .priority = 1, /* higher than tracepoints */ }; -static __init int init_jump_label_module(void) +static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } -early_initcall(init_jump_label_module); +early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + int ret = __jump_label_text_reserved(__start___jump_table, + __stop___jump_table, start, end); + + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = __jump_label_mod_text_reserved(start, end); +#endif + return ret; +} + +static void jump_label_update(struct jump_label_key *key, int enable) +{ + struct jump_entry *entry = key->entries; + + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, __stop___jump_table, enable); + +#ifdef CONFIG_MODULES + __jump_label_mod_update(key, enable); +#endif +} + #endif diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..079f1d39a8b8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static int is_ksym_addr(unsigned long addr) @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. */ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) { @@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..8d814cbc8109 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/kmsg_dump.h> +#include <linux/syscore_ops.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); - /* Initialize the list of unuseable pages */ + /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ @@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. + * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ @@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image) /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); - /* Walk through and free any unuseable pages I have cached */ + /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unuseable_pages); } @@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void) return size; } -static void free_reserved_phys_range(unsigned long begin, unsigned long end) +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) { unsigned long addr; @@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, PAGE_SIZE); end = roundup(start + new_size, PAGE_SIZE); - free_reserved_phys_range(end, crashk_res.end); + crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); @@ -1529,8 +1531,7 @@ int kernel_kexec(void) if (error) goto Enable_cpus; local_irq_disable(); - /* Suspend system devices */ - error = sysdev_suspend(PMSG_FREEZE); + error = syscore_suspend(); if (error) goto Enable_irqs; } else @@ -1545,7 +1546,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - sysdev_resume(); + syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a2..ad6a81c58b44 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,6 +25,7 @@ #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> +#include <linux/cred.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/workqueue.h> @@ -43,6 +44,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); + #ifdef CONFIG_MODULES /* @@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + struct cred *new; int retval; spin_lock_irq(¤t->sighand->siglock); @@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data) goto fail; } + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto fail; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + commit_creds(new); + retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); @@ -245,7 +267,6 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -301,6 +322,15 @@ void usermodehelper_enable(void) usermodehelper_disabled = 0; } +/** + * usermodehelper_is_disabled - check if new helpers are allowed to be started + */ +bool usermodehelper_is_disabled(void) +{ + return usermodehelper_disabled; +} +EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); + static void helper_lock(void) { atomic_inc(&running_helpers); @@ -312,12 +342,6 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } -#else /* CONFIG_PM_SLEEP */ -#define usermodehelper_disabled 0 - -static inline void helper_lock(void) {} -static inline void helper_unlock(void) {} -#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper @@ -418,6 +442,84 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write and array of ulongs from userspace. Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0b624e791805..3b053c04dd86 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -16,6 +16,7 @@ #include <linux/kexec.h> #include <linux/profile.h> #include <linux/sched.h> +#include <linux/capability.h> #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_KEXEC */ +/* whether file capabilities are enabled */ +static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", file_caps_enabled); +} +KERNEL_ATTR_RO(fscaps); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -158,6 +167,7 @@ struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { + &fscaps_attr.attr, #if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba990a3..4ba7cccb4994 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or + * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. @@ -183,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) return; } - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); p->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35e528d..376066e10413 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, } /** - * __account_scheduler_latency - record an occured latency + * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency * @usecs - the duration of the latency in microseconds * @inter - 1 if the sleep was interruptible, 0 if uninterruptible diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f5..63437d065ac8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } +static int __print_lock_name(struct lock_class *class) +{ + char str[KSYM_NAME_LEN]; + const char *name; + + name = class->name; + if (!name) + name = __get_key_name(class->key, str); + + return printk("%s", name); +} + static void print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; @@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth) return 0; } +static void +print_circular_lock_scenario(struct held_lock *src, + struct held_lock *tgt, + struct lock_list *prt) +{ + struct lock_class *source = hlock_class(src); + struct lock_class *target = hlock_class(tgt); + struct lock_class *parent = prt->class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. + */ + if (parent != source) { + printk("Chain exists of:\n "); + __print_lock_name(source); + printk(" --> "); + __print_lock_name(parent); + printk(" --> "); + __print_lock_name(target); + printk("\n\n"); + } + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(");\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(source); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + /* * When a circular dependency is detected, print the * header first: @@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this, { struct task_struct *curr = current; struct lock_list *parent; + struct lock_list *first_parent; int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) @@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this, print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); + first_parent = parent; while (parent) { print_circular_bug_entry(parent, --depth); @@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this, } printk("\nother info that might help us debug this:\n\n"); + print_circular_lock_scenario(check_src, check_tgt, + first_parent); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); @@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, printk("\n"); if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad BFS generated tree\n", __func__); + printk("lockdep:%s bad path found in chain graph\n", __func__); break; } @@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf, return; } +static void +print_irq_lock_scenario(struct lock_list *safe_entry, + struct lock_list *unsafe_entry, + struct lock_class *prev_class, + struct lock_class *next_class) +{ + struct lock_class *safe_class = safe_entry->class; + struct lock_class *unsafe_class = unsafe_entry->class; + struct lock_class *middle_class = prev_class; + + if (middle_class == safe_class) + middle_class = next_class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. + */ + if (middle_class != unsafe_class) { + printk("Chain exists of:\n "); + __print_lock_name(safe_class); + printk(" --> "); + __print_lock_name(middle_class); + printk(" --> "); + __print_lock_name(unsafe_class); + printk("\n\n"); + } + + printk(" Possible interrupt unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(unsafe_class); + printk(");\n"); + printk(" local_irq_disable();\n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk(" lock("); + __print_lock_name(middle_class); + printk(");\n"); + printk(" <Interrupt>\n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + static int print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, @@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr, print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); printk("\nother info that might help us debug this:\n\n"); + print_irq_lock_scenario(backwards_entry, forwards_entry, + hlock_class(prev), hlock_class(next)); + lockdep_print_held_locks(curr); printk("\nthe dependencies between %s-irq-safe lock", irqclass); @@ -1539,6 +1665,26 @@ static inline void inc_chains(void) #endif +static void +print_deadlock_scenario(struct held_lock *nxt, + struct held_lock *prv) +{ + struct lock_class *next = hlock_class(nxt); + struct lock_class *prev = hlock_class(prv); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(prev); + printk(");\n"); + printk(" lock("); + __print_lock_name(next); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); + printk(" May be due to missing lock nesting notation\n\n"); +} + static int print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) @@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, print_lock(prev); printk("\nother info that might help us debug this:\n"); + print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); @@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; - int i, j, n, cn; + int i, j; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -1886,15 +2033,9 @@ cache_hit: } i++; chain->depth = curr->lockdep_depth + 1 - i; - cn = nr_chain_hlocks; - while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { - n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); - if (n == cn) - break; - cn = n; - } - if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = cn; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; @@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr) #endif } +static void +print_usage_bug_scenario(struct held_lock *lock) +{ + struct lock_class *class = hlock_class(lock); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(class); + printk(");\n"); + printk(" <Interrupt>\n"); + printk(" lock("); + __print_lock_name(class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + static int print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) @@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_irqtrace_events(curr); printk("\nother info that might help us debug this:\n"); + print_usage_bug_scenario(this); + lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); @@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr, struct held_lock *this, int forwards, const char *irqclass) { + struct lock_list *entry = other; + struct lock_list *middle = NULL; + int depth; + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; @@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr, printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); printk("\nother info that might help us debug this:\n"); + + /* Find a middle lock (if one exists) */ + depth = get_lock_depth(other); + do { + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + middle = entry; + entry = get_lock_parent(entry); + depth--; + } while (entry && entry != root && (depth >= 0)); + if (forwards) + print_irq_lock_scenario(root, other, + middle ? middle->class : root->class, other->class); + else + print_irq_lock_scenario(other, root, + middle ? middle->class : other->class, root->class); + lockdep_print_held_locks(curr); printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); @@ -2309,7 +2493,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(curr->hardirqs_enabled)) { /* * Neither irq nor preemption are disabled here - * so this is racy by nature but loosing one hit + * so this is racy by nature but losing one hit * in a stat is not a big deal. */ __debug_atomic_inc(redundant_hardirqs_on); @@ -2620,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (!graph_lock()) return 0; /* - * Make sure we didnt race: + * Make sure we didn't race: */ if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2fc4b36..71edd2f60c02 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; + sum_forward_deps = 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_unsafe * nr_hardirq_safe + nr_list_entries); - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); diff --git a/kernel/module.c b/kernel/module.c index efa290ea94bf..795bdc7f5c3f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -57,6 +57,7 @@ #include <linux/kmemleak.h> #include <linux/jump_label.h> #include <linux/pfn.h> +#include <linux/bsearch.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr, struct module *owner, bool (*fn)(const struct symsearch *syms, struct module *owner, - unsigned int symnum, void *data), + void *data), void *data) { - unsigned int i, j; + unsigned int j; for (j = 0; j < arrsize; j++) { - for (i = 0; i < arr[j].stop - arr[j].start; i++) - if (fn(&arr[j], owner, i, data)) - return true; + if (fn(&arr[j], owner, data)) + return true; } return false; } /* Returns true as soon as fn returns true, otherwise false. */ -bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, - unsigned int symnum, void *data), void *data) +bool each_symbol_section(bool (*fn)(const struct symsearch *arr, + struct module *owner, + void *data), + void *data) { struct module *mod; static const struct symsearch arr[] = { @@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, } return false; } -EXPORT_SYMBOL_GPL(each_symbol); +EXPORT_SYMBOL_GPL(each_symbol_section); struct find_symbol_arg { /* Input */ @@ -323,15 +325,12 @@ struct find_symbol_arg { const struct kernel_symbol *sym; }; -static bool find_symbol_in_section(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) +static bool check_symbol(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) { struct find_symbol_arg *fsa = data; - if (strcmp(syms->start[symnum].name, fsa->name) != 0) - return false; - if (!fsa->gplok) { if (syms->licence == GPL_ONLY) return false; @@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms, return true; } +static int cmp_name(const void *va, const void *vb) +{ + const char *a; + const struct kernel_symbol *b; + a = va; b = vb; + return strcmp(a, b->name); +} + +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + void *data) +{ + struct find_symbol_arg *fsa = data; + struct kernel_symbol *sym; + + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, + sizeof(struct kernel_symbol), cmp_name); + + if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) + return true; + + return false; +} + /* Find a symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, @@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name, fsa.gplok = gplok; fsa.warn = warn; - if (each_symbol(find_symbol_in_section, &fsa)) { + if (each_symbol_section(find_symbol_in_section, &fsa)) { if (owner) *owner = fsa.owner; if (crc) @@ -809,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, wait_for_zero_refcount(mod); mutex_unlock(&module_mutex); - /* Final destruction now noone is using it. */ + /* Final destruction now no one is using it. */ if (mod->exit != NULL) mod->exit(); blocking_notifier_call_chain(&module_notify_list, @@ -1168,7 +1191,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base, } } -/* Setting memory back to RW+NX before releasing it */ -void unset_section_ro_nx(struct module *mod, void *module_region) +static void unset_module_core_ro_nx(struct module *mod) { - unsigned long total_pages; - - if (mod->module_core == module_region) { - /* Set core as NX+RW */ - total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); - set_memory_nx((unsigned long)mod->module_core, total_pages); - set_memory_rw((unsigned long)mod->module_core, total_pages); + set_page_attributes(mod->module_core + mod->core_text_size, + mod->module_core + mod->core_size, + set_memory_x); + set_page_attributes(mod->module_core, + mod->module_core + mod->core_ro_size, + set_memory_rw); +} - } else if (mod->module_init == module_region) { - /* Set init as NX+RW */ - total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); - set_memory_nx((unsigned long)mod->module_init, total_pages); - set_memory_rw((unsigned long)mod->module_init, total_pages); - } +static void unset_module_init_ro_nx(struct module *mod) +{ + set_page_attributes(mod->module_init + mod->init_text_size, + mod->module_init + mod->init_size, + set_memory_x); + set_page_attributes(mod->module_init, + mod->module_init + mod->init_ro_size, + set_memory_rw); } /* Iterate through all modules and set each module's text as RW */ -void set_all_modules_text_rw() +void set_all_modules_text_rw(void) { struct module *mod; @@ -1648,7 +1672,7 @@ void set_all_modules_text_rw() } /* Iterate through all modules and set each module's text as RO */ -void set_all_modules_text_ro() +void set_all_modules_text_ro(void) { struct module *mod; @@ -1669,7 +1693,8 @@ void set_all_modules_text_ro() } #else static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } -static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } +static void unset_module_core_ro_nx(struct module *mod) { } +static void unset_module_init_ro_nx(struct module *mod) { } #endif /* Free a module, remove from lists, etc. */ @@ -1696,7 +1721,7 @@ static void free_module(struct module *mod) destroy_params(mod->kp, mod->num_kp); /* This may be NULL, but that's OK */ - unset_section_ro_nx(mod, mod->module_init); + unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1705,7 +1730,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ - unset_section_ro_nx(mod, mod->module_core); + unset_module_core_ro_nx(mod); module_free(mod, mod->module_core); #ifdef CONFIG_MPU @@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name, const struct kernel_symbol *start, const struct kernel_symbol *stop) { - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); } static int is_exported(const char *name, unsigned long value, @@ -2777,7 +2799,7 @@ static struct module *load_module(void __user *umod, mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since + * info during argument parsing. No one should access us, since * strong_try_module_get() will fail. * lockdep/oops can run asynchronous, so use the RCU list insertion * function to insert in a way safe to concurrent readers. @@ -2790,7 +2812,7 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - if (!mod->taints) + if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) dynamic_debug_setup(info.debug, info.num_debug); /* Find duplicate symbols */ @@ -2827,7 +2849,7 @@ static struct module *load_module(void __user *umod, module_bug_cleanup(mod); ddebug: - if (!mod->taints) + if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); @@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif - unset_section_ro_nx(mod, mod->module_init); + unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; + mod->init_ro_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); @@ -2971,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod, else nextval = (unsigned long)mod->module_core+mod->core_text_size; - /* Scan for closest preceeding symbol, and next symbol. (ELF + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < mod->num_symtab; i++) { if (mod->symtab[i].st_shndx == SHN_UNDEF) @@ -3224,7 +3247,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb28ecf..d607ed5dd441 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock); */ static inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; preempt_disable(); - mutex_acquire(&lock->dep_map, subclass, 0, ip); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* @@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ for (;;) { - struct thread_info *owner; - - /* - * If we own the BKL, then don't spin. The owner of - * the mutex might be waiting on us to release the BKL. - */ - if (unlikely(current->lock_depth >= 0)) - break; + struct task_struct *owner; /* * If there's an owner, wait for it to either @@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - /* didnt get the lock, go to sleep: */ + /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable_no_resched(); schedule(); @@ -276,16 +269,25 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, _RET_IP_); + subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched @@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched @@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } #endif diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -19,7 +19,7 @@ #ifdef CONFIG_SMP static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0e..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * ns_cgroup.c - namespace cgroup subsystem - * - * Copyright 2006, 2007 IBM Corp - */ - -#include <linux/module.h> -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/proc_fs.h> -#include <linux/slab.h> -#include <linux/nsproxy.h> - -struct ns_cgroup { - struct cgroup_subsys_state css; -}; - -struct cgroup_subsys ns_subsys; - -static inline struct ns_cgroup *cgroup_to_ns( - struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), - struct ns_cgroup, css); -} - -int ns_cgroup_clone(struct task_struct *task, struct pid *pid) -{ - char name[PROC_NUMBUF]; - - snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); - return cgroup_clone(task, &ns_subsys, name); -} - -/* - * Rules: - * 1. you can only enter a cgroup which is a descendant of your current - * cgroup - * 2. you can only place another process into a cgroup if - * a. you have CAP_SYS_ADMIN - * b. your cgroup is an ancestor of task's destination cgroup - * (hence either you are in the same cgroup as task, or in an - * ancestor cgroup thereof) - */ -static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) -{ - if (current != task) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!cgroup_is_descendant(new_cgroup, current)) - return -EPERM; - } - - if (!cgroup_is_descendant(new_cgroup, task)) - return -EPERM; - - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (!cgroup_is_descendant(new_cgroup, c)) { - rcu_read_unlock(); - return -EPERM; - } - } - rcu_read_unlock(); - } - - return 0; -} - -/* - * Rules: you can only create a cgroup if - * 1. you are capable(CAP_SYS_ADMIN) - * 2. the target cgroup is a descendant of your own cgroup - */ -static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup, current)) - return ERR_PTR(-EPERM); - if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { - printk("ns_cgroup can't be created with parent " - "'clone_children' set.\n"); - return ERR_PTR(-EINVAL); - } - - printk_once("ns_cgroup deprecated: consider using the " - "'clone_children' flag without the ns_cgroup.\n"); - - ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); - if (!ns_cgroup) - return ERR_PTR(-ENOMEM); - return &ns_cgroup->css; -} - -static void ns_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - ns_cgroup = cgroup_to_ns(cgroup); - kfree(ns_cgroup); -} - -struct cgroup_subsys ns_subsys = { - .name = "ns", - .can_attach = ns_can_attach, - .create = ns_create, - .destroy = ns_destroy, - .subsys_id = ns_subsys_id, -}; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c00e26d..d6a00f3de15d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,9 @@ #include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> +#include <linux/proc_fs.h> +#include <linux/file.h> +#include <linux/syscalls.h> static struct kmem_cache *nsproxy_cachep; @@ -69,13 +72,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; @@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current, task_pid(current)); - if (err) - put_nsproxy(*new_nsp); - out: return err; } @@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +SYSCALL_DEFINE2(setns, int, fd, int, nstype) +{ + const struct proc_ns_operations *ops; + struct task_struct *tsk = current; + struct nsproxy *new_nsproxy; + struct proc_inode *ei; + struct file *file; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return PTR_ERR(file); + + err = -EINVAL; + ei = PROC_I(file->f_dentry->d_inode); + ops = ei->ns_ops; + if (nstype && (ops->type != nstype)) + goto out; + + new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + if (IS_ERR(new_nsproxy)) { + err = PTR_ERR(new_nsproxy); + goto out; + } + + err = ops->install(new_nsproxy, ei->ns); + if (err) { + free_nsproxy(new_nsproxy); + goto out; + } + switch_task_namespaces(tsk, new_nsproxy); +out: + fput(file); + return err; +} + static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); diff --git a/kernel/padata.c b/kernel/padata.c index 751019415d23..b91941df5e63 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) /* * This cpu has to do the parallel processing of the next * object. It's waiting in the cpu's parallelization queue, - * so exit imediately. + * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { del_timer(&pd->timer); @@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again - * from the timer function if noone else cares for it. + * from the timer function if no one else cares for it. */ if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) @@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) put_online_cpus(); } -/* Replace the internal control stucture with a new one. */ +/* Replace the internal control structure with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { @@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) } /** - * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) * padata cpumasks. * * @pinst: padata instance diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a1704..69231670eb95 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); diff --git a/kernel/params.c b/kernel/params.c index 0da1411222b9..ed72e1330862 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -95,7 +95,7 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { - /* Noone handled NULL, so do it here. */ + /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; DEBUGP("They are equal! Calling %p\n", @@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); int param_set_bool(const char *val, const struct kernel_param *kp) { bool v; + int ret; /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ - switch (val[0]) { - case 'y': case 'Y': case '1': - v = true; - break; - case 'n': case 'N': case '0': - v = false; - break; - default: - return -EINVAL; - } + ret = strtobool(val, &v); + if (ret) + return ret; if (kp->flags & KPARAM_ISBOOL) *(bool *)kp->arg = v; @@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr, return sprintf(buf, "%s\n", vattr->version); } -extern struct module_version_attribute __start___modver[], __stop___modver[]; +extern const struct module_version_attribute *__start___modver[]; +extern const struct module_version_attribute *__stop___modver[]; static void __init version_sysfs_builtin(void) { - const struct module_version_attribute *vattr; + const struct module_version_attribute **p; struct module_kobject *mk; int err; - for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + for (p = __start___modver; p < __stop___modver; p++) { + const struct module_version_attribute *vattr = *p; + mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); diff --git a/kernel/pid.c b/kernel/pid.c index 39b65b69584f..57a8346a270e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) { int offset; struct pidmap *map, *end; + if (last >= PID_MAX_LIMIT) + return -1; + offset = (last + 1) & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; end = &pid_ns->pidmap[PIDMAP_ENTRIES]; @@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_unlock(); return pid; } +EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { @@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) rcu_read_unlock(); return result; } +EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0b..e9c9adc84ca6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i; + int i, err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + return ns; +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); out: - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void destroy_pid_namespace(struct pid_namespace *ns) diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 6a8fad82a3ad..6824ca7d4d0c 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -40,6 +40,7 @@ #include <linux/string.h> #include <linux/platform_device.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/uaccess.h> @@ -112,11 +113,14 @@ static struct pm_qos_object *pm_qos_array[] = { static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); static int pm_qos_power_release(struct inode *inode, struct file *filp); static const struct file_operations pm_qos_power_fops = { .write = pm_qos_power_write, + .read = pm_qos_power_read, .open = pm_qos_power_open, .release = pm_qos_power_release, .llseek = noop_llseek, @@ -389,28 +393,61 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) } +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_object *o; + struct pm_qos_request_list *pm_qos_req = filp->private_data; + + if (!pm_qos_req) + return -EINVAL; + if (!pm_qos_request_active(pm_qos_req)) + return -EINVAL; + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(o); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - int x; - char ascii_value[11]; struct pm_qos_request_list *pm_qos_req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else if (count == 11) { /* len('0x12345678/0') */ - if (copy_from_user(ascii_value, buf, 11)) + } else if (count <= 11) { /* ASCII perhaps? */ + char ascii_value[11]; + unsigned long int ulval; + int ret; + + if (copy_from_user(ascii_value, buf, count)) return -EFAULT; - if (strlen(ascii_value) != 10) - return -EINVAL; - x = sscanf(ascii_value, "%x", &value); - if (x != 1) + + if (count > 10) { + if (ascii_value[10] == '\n') + ascii_value[10] = '\0'; + else + return -EINVAL; + } else { + ascii_value[count] = '\0'; + } + ret = strict_strtoul(ascii_value, 16, &ulval); + if (ret) { + pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); return -EINVAL; - pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); - } else + } + value = (s32)lower_32_bits(ulval); + } else { return -EINVAL; + } pm_qos_req = filp->private_data; pm_qos_update_request(pm_qos_req, value); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850e..58f405b581e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) return p->utime; } -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, } -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * This is called from sys_timer_create() and do_cpu_nanosleep() with the * new timer already all-zeros initialized. */ -int posix_cpu_timer_create(struct k_itimer *new_timer) +static int posix_cpu_timer_create(struct k_itimer *new_timer) { int ret = 0; const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_del(struct k_itimer *timer) +static int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; int ret = 0; @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; union cpu_time_count old_expires, new_expires, old_incr, val; @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, return ret; } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { union cpu_time_count now; struct task_struct *p = timer->it.cpu.task; @@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Now that all the timers on our list have the firing flag, - * noone will touch their list entries but us. We'll take + * no one will touch their list entries but us. We'll take * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ @@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + ¤t_thread_info()->restart_block; struct itimerspec it; int error; @@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (error == -ERESTART_RESTARTBLOCK) { - if (flags & TIMER_ABSTIME) + if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; + restart_block->nanosleep.clockid = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); } return error; } -long posix_cpu_nsleep_restart(struct restart_block *restart_block) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; + clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec t; struct itimerspec it; int error; - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; + t = ns_to_timespec(restart_block->nanosleep.expires); - restart_block->fn = do_no_restart_syscall; error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; + restart_block->nanosleep.expires = timespec_to_ns(&t); } return error; } - #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) @@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} + +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; static __init int init_posix_cpu_timers(void) { struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; struct timespec ts; - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc53..4556182527f3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -41,6 +41,7 @@ #include <linux/init.h> #include <linux/compiler.h> #include <linux/idr.h> +#include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> @@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock); #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif /* * The timer ID is turned into a timer address by idr_find(). @@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock); /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. + * clocks. * * RESOLUTION: Clock resolution is used to round up timer and interval * times, NOT to report clock times, which are reported with as @@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock); * necessary code is written. The standard says we should say * something about this issue in the documentation... * - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_pid - * fields are not modified by timer code. + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. * * Permissions: It is assumed that the clock_settime() function defined * for each clock will take care of permission checks. Some @@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; */ static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. - */ - -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) { ktime_get_real_ts(tp); return 0; } -static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -static int no_timer_create(struct k_itimer *new_timer) -{ - return -EOPNOTSUPP; -} - -static int no_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return -EOPNOTSUPP; -} - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) { - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - if (posix_clocks[which_clock].res != 0) - return 0; - return 1; + return do_adjtimex(t); } /* @@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* - * Get monotonic time for posix timers + * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) { @@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; } + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, }; struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -342,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) * restarted (i.e. we have flagged this in the sys_private entry of the * info block). * - * To protect aginst the timer going away while the interrupt is queued, + * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ void do_schedule_next_timer(struct siginfo *info) @@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", clock_id); return; } posix_clocks[clock_id] = *new_clock; } -EXPORT_SYMBOL_GPL(register_posix_clock); +EXPORT_SYMBOL_GPL(posix_timers_register_clock); static struct k_itimer * alloc_posix_timer(void) { @@ -508,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; } +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + #define IT_ID_SET 1 #define IT_ID_NOT_SET 0 static void release_posix_timer(struct k_itimer *tmr, int it_id_set) @@ -520,7 +510,24 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - kmem_cache_free(posix_timers_cache, tmr); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); +} + +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; } /* Create a POSIX.1b interval timer. */ @@ -529,13 +536,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) @@ -597,7 +607,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + error = kc->timer_create(new_timer); if (error) goto out; @@ -607,7 +617,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, spin_unlock_irq(¤t->sighand->siglock); return 0; - /* + /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify @@ -628,22 +638,18 @@ out: static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; - /* - * Watch out here. We do a irqsave on the idr_lock and pass the - * flags part over to the timer lock. Must not let interrupts in - * while we are moving the lock. - */ - spin_lock_irqsave(&idr_lock, *flags); + + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { - spin_lock(&timr->it_lock); + spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { - spin_unlock(&idr_lock); + rcu_read_unlock(); return timr; } - spin_unlock(&timr->it_lock); + spin_unlock_irqrestore(&timr->it_lock, *flags); } - spin_unlock_irqrestore(&idr_lock, *flags); + rcu_read_unlock(); return NULL; } @@ -709,22 +715,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct k_itimer *timr; struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; unsigned long flags; + int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); unlock_timer(timr, flags); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; - return 0; + return ret; } /* @@ -813,6 +825,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, int error = 0; unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; if (!new_setting) return -EINVAL; @@ -828,8 +841,11 @@ retry: if (!timr) return -EINVAL; - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -844,7 +860,7 @@ retry: return error; } -static inline int common_timer_del(struct k_itimer *timer) +static int common_timer_del(struct k_itimer *timer) { timer->it.real.interval.tv64 = 0; @@ -855,7 +871,11 @@ static inline int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ @@ -927,69 +947,76 @@ void exit_itimers(struct signal_struct *sig) } } -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. */ - return -ENOTSUP; -#endif -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec new_tp; - if (invalid_clockid(which_clock)) + if (!kc || !kc->clock_set) return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); + + error = kc->clock_get(which_clock, &kernel_tp); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec rtn_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; - } return error; } @@ -1009,10 +1036,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec t; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; @@ -1020,27 +1050,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!timespec_valid(&t)) return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); -} - -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); + return kc->nsleep(which_clock, flags, &t, rmtp); } /* * This will restart clock_nanosleep. This is required only by * compat_clock_nanosleep_restart for now. */ -long -clock_nanosleep_restart(struct restart_block *restart_block) +long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; + clockid_t which_clock = restart_block->nanosleep.clockid; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); + return kc->nsleep_restart(restart_block); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 265729966ece..87f4d24b55b0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,125 +1,12 @@ -config PM - bool "Power Management support" - depends on !IA64_HP_SIM - ---help--- - "Power Management" means that parts of your computer are shut - off or put into a power conserving "sleep" mode if they are not - being used. There are two competing standards for doing this: APM - and ACPI. If you want to use either one, say Y here and then also - to the requisite support below. - - Power Management is most important for battery powered laptop - computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at <http://www.linux-on-laptops.com/> or - Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> - and the Battery Powered Linux mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. - - Note that, even if you say N here, Linux on the x86 architecture - will issue the hlt instruction if nothing is to be done, thereby - sending the processor to sleep and saving power. - -config PM_DEBUG - bool "Power Management Debug Support" - depends on PM - ---help--- - This option enables various debugging support in the Power Management - code. This is helpful when debugging and reporting PM bugs, like - suspend support. - -config PM_ADVANCED_DEBUG - bool "Extra PM attributes in sysfs for low-level debugging/testing" - depends on PM_DEBUG - default n - ---help--- - Add extra sysfs attributes allowing one to access some Power Management - fields of device objects from user space. If you are not a kernel - developer interested in debugging/testing Power Management, say "no". - -config PM_VERBOSE - bool "Verbose Power Management debugging" - depends on PM_DEBUG - default n - ---help--- - This option enables verbose messages from the Power Management code. - -config CAN_PM_TRACE - def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL - -config PM_TRACE - bool - help - This enables code to save the last PM event point across - reboot. The architecture needs to support this, x86 for - example does by saving things in the RTC, see below. - - The architecture specific code must provide the extern - functions from <linux/resume-trace.h> as well as the - <asm/resume-trace.h> header with a TRACE_RESUME() macro. - - The way the information is presented is architecture- - dependent, x86 will print the information during a - late_initcall. - -config PM_TRACE_RTC - bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE - depends on X86 - select PM_TRACE - default n - ---help--- - This enables some cheesy code to save the last PM event point in the - RTC across reboots, so that you can debug a machine that just hangs - during suspend (or more commonly, during resume). - - To use this debugging feature you should attempt to suspend the - machine, reboot it and then run - - dmesg -s 1000000 | grep 'hash matches' - - CAUTION: this option will cause your machine's real-time clock to be - set to an invalid time after a resume. - -config PM_SLEEP_SMP - bool - depends on SMP - depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE - depends on PM_SLEEP - select HOTPLUG - select HOTPLUG_CPU - default y - -config PM_SLEEP - bool - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE - default y - -config PM_SLEEP_ADVANCED_DEBUG - bool - depends on PM_ADVANCED_DEBUG - default n - config SUSPEND bool "Suspend to RAM and standby" - depends on PM && ARCH_SUSPEND_POSSIBLE + depends on ARCH_SUSPEND_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). -config PM_TEST_SUSPEND - bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- - This option will let you suspend your machine during bootup, and - make it wake up a few seconds later using an RTC wakeup alarm. - Enable this with a kernel parameter like "test_suspend=mem". - - You probably want to have your system's RTC driver statically - linked, ensuring that it's available when this test runs. - config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN @@ -131,9 +18,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" - depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -196,6 +87,100 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS + +config PM_SLEEP_SMP + def_bool y + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG + select HOTPLUG_CPU + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + def_bool y + depends on PM_SLEEP || PM_RUNTIME + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config CAN_PM_TRACE + def_bool y + depends on PM_DEBUG && PM_SLEEP + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from <linux/resume-trace.h> as well as the + <asm/resume-trace.h> header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on CAN_PM_TRACE + depends on X86 + select PM_TRACE + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION @@ -222,31 +207,11 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config PM_RUNTIME - bool "Run-time PM core functionality" - depends on PM - ---help--- - Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated - wake-up event or a driver's request. - - Hardware support is generally required for this functionality to work - and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and - wake-up events. - -config PM_OPS - bool - depends on PM_SLEEP || PM_RUNTIME - default y - config ARCH_HAS_OPP bool config PM_OPP bool "Operating Performance Point (OPP) Layer library" - depends on PM depends on ARCH_HAS_OPP ---help--- SOCs have a standard set of tuples consisting of frequency and @@ -258,3 +223,7 @@ config PM_OPP representing individual voltage domains and provides SOC implementations a ready to use framework to manage OPPs. For more information, read <file:Documentation/power/opp.txt> + +config PM_RUNTIME_CLK + def_bool y + depends on PM_RUNTIME && HAVE_CLK diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18b53e3..c5ebc6a90643 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,4 +1,5 @@ -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df9..d09dd10c5a5e 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1832bd264219..8f7b1db1ece1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -23,8 +23,8 @@ #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/gfp.h> +#include <linux/syscore_ops.h> #include <scsi/scsi_scan.h> -#include <asm/suspend.h> #include "power.h" @@ -54,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN; static const struct platform_hibernation_ops *hibernation_ops; /** - * hibernation_set_ops - set the global hibernate operations - * @ops: the hibernation operations to use in subsequent hibernation transitions + * hibernation_set_ops - Set the global hibernate operations. + * @ops: Hibernation operations to use in subsequent hibernation transitions. */ - void hibernation_set_ops(const struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot @@ -114,10 +113,9 @@ static int hibernation_test(int level) { return 0; } #endif /* !CONFIG_PM_DEBUG */ /** - * platform_begin - tell the platform driver that we're starting - * hibernation + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. */ - static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? @@ -125,10 +123,9 @@ static int platform_begin(int platform_mode) } /** - * platform_end - tell the platform driver that we've entered the - * working state + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. */ - static void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) @@ -136,8 +133,11 @@ static void platform_end(int platform_mode) } /** - * platform_pre_snapshot - prepare the machine for hibernation using the - * platform driver if so configured and return an error code if it fails + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for creating a hibernate image, + * if so configured, and return an error code if that fails. */ static int platform_pre_snapshot(int platform_mode) @@ -147,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode) } /** - * platform_leave - prepare the machine for switching to the normal mode - * of operation using the platform driver (called with interrupts disabled) + * platform_leave - Call platform to prepare a transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver prepare to prepare the machine for switching to the + * normal mode of operation. + * + * This routine is called on one CPU with interrupts disabled. */ - static void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) @@ -158,10 +162,14 @@ static void platform_leave(int platform_mode) } /** - * platform_finish - switch the machine to the normal mode of operation - * using the platform driver (must be called after platform_prepare()) + * platform_finish - Call platform to switch the system to the working state. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the machine to the normal mode of + * operation. + * + * This routine must be called after platform_prepare(). */ - static void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) @@ -169,11 +177,15 @@ static void platform_finish(int platform_mode) } /** - * platform_pre_restore - prepare the platform for the restoration from a - * hibernation image. If the restore fails after this function has been - * called, platform_restore_cleanup() must be called. + * platform_pre_restore - Prepare for hibernate image restoration. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to prepare the system for resume from a hibernation + * image. + * + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. */ - static int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? @@ -181,12 +193,16 @@ static int platform_pre_restore(int platform_mode) } /** - * platform_restore_cleanup - switch the platform to the normal mode of - * operation after a failing restore. If platform_pre_restore() has been - * called before the failing restore, this function must be called too, - * regardless of the result of platform_pre_restore(). + * platform_restore_cleanup - Switch to the working state after failing restore. + * @platform_mode: Whether or not to use the platform driver. + * + * Use the platform driver to switch the system to the normal mode of operation + * after a failing restore. + * + * If platform_pre_restore() has been called before the failing restore, this + * function must be called too, regardless of the result of + * platform_pre_restore(). */ - static void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) @@ -194,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode) } /** - * platform_recover - recover the platform from a failure to suspend - * devices. + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. */ - static void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) @@ -205,13 +220,12 @@ static void platform_recover(int platform_mode) } /** - * swsusp_show_speed - print the time elapsed between two events. - * @start: Starting event. - * @stop: Final event. - * @nr_pages - number of pages processed between @start and @stop - * @msg - introductory message to print + * swsusp_show_speed - Print time elapsed between two events during hibernation. + * @start: Starting event. + * @stop: Final event. + * @nr_pages: Number of memory pages processed between @start and @stop. + * @msg: Additional diagnostic message to print. */ - void swsusp_show_speed(struct timeval *start, struct timeval *stop, unsigned nr_pages, char *msg) { @@ -234,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, } /** - * create_image - freeze devices that need to be frozen with interrupts - * off, create the hibernation image and thaw those devices. Control - * reappears in this routine after a restore. + * create_image - Create a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image + * and execute the drivers' .thaw_noirq() callbacks. + * + * Control reappears in this routine after the subsequent restore. */ - static int create_image(int platform_mode) { int error; - error = arch_prepare_suspend(); - if (error) - return error; - - /* At this point, dpm_suspend_start() has been called, but *not* - * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. - * Otherwise, drivers for some devices (e.g. interrupt controllers) - * become desynchronized with the actual state of the hardware - * at resume time, and evil weirdness ensues. - */ error = dpm_suspend_noirq(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " @@ -271,7 +278,7 @@ static int create_image(int platform_mode) local_irq_disable(); - error = sysdev_suspend(PMSG_FREEZE); + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -295,10 +302,7 @@ static int create_image(int platform_mode) } Power_up: - sysdev_resume(); - /* NOTE: dpm_resume_noirq() is just a resume() for devices - * that suspended with irqs off ... no overall powerup. - */ + syscore_resume(); Enable_irqs: local_irq_enable(); @@ -316,30 +320,32 @@ static int create_image(int platform_mode) } /** - * hibernation_snapshot - quiesce devices and create the hibernation - * snapshot image. - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform firmware for the power transition. + * hibernation_snapshot - Quiesce devices and create a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. * - * Must be called with pm_mutex held + * This routine must be called with pm_mutex held. */ - int hibernation_snapshot(int platform_mode) { + pm_message_t msg = PMSG_RECOVER; int error; error = platform_begin(platform_mode); if (error) goto Close; + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto Complete_devices; + /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) - goto Close; + goto Complete_devices; suspend_console(); pm_restrict_gfp_mask(); - error = dpm_suspend_start(PMSG_FREEZE); + error = dpm_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -357,13 +363,17 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) swsusp_free(); - dpm_resume_end(in_suspend ? - (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; + dpm_resume(msg); if (error || !in_suspend) pm_restore_gfp_mask(); resume_console(); + + Complete_devices: + dpm_complete(msg); + Close: platform_end(platform_mode); return error; @@ -374,13 +384,14 @@ int hibernation_snapshot(int platform_mode) } /** - * resume_target_kernel - prepare devices that need to be suspended with - * interrupts off, restore the contents of highmem that have not been - * restored yet from the image and run the low level code that will restore - * the remaining contents of memory and switch to the just restored target - * kernel. + * resume_target_kernel - Restore system state from a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' .freeze_noirq() callbacks, restore the contents of + * highmem that have not been restored yet from the image and run the low-level + * code that will restore the remaining contents of memory and switch to the + * just restored target kernel. */ - static int resume_target_kernel(bool platform_mode) { int error; @@ -402,34 +413,36 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); - error = sysdev_suspend(PMSG_QUIESCE); + error = syscore_suspend(); if (error) goto Enable_irqs; - /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = restore_highmem(); if (!error) { error = swsusp_arch_resume(); /* * The code below is only ever reached in case of a failure. - * Otherwise execution continues at place where - * swsusp_arch_suspend() was called + * Otherwise, execution continues at the place where + * swsusp_arch_suspend() was called. */ BUG_ON(!error); - /* This call to restore_highmem() undos the previous one */ + /* + * This call to restore_highmem() reverts the changes made by + * the previous one. + */ restore_highmem(); } /* * The only reason why swsusp_arch_resume() can fail is memory being * very tight, so we have to free it as soon as we can to avoid - * subsequent failures + * subsequent failures. */ swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - sysdev_resume(); + syscore_resume(); Enable_irqs: local_irq_enable(); @@ -446,14 +459,12 @@ static int resume_target_kernel(bool platform_mode) } /** - * hibernation_restore - quiesce devices and restore the hibernation - * snapshot image. If successful, control returns in hibernation_snaphot() - * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform firmware for the transition. + * hibernation_restore - Quiesce devices and restore from a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. * - * Must be called with pm_mutex held + * This routine must be called with pm_mutex held. If it is successful, control + * reappears in the restored target kernel in hibernation_snaphot(). */ - int hibernation_restore(int platform_mode) { int error; @@ -473,10 +484,8 @@ int hibernation_restore(int platform_mode) } /** - * hibernation_platform_enter - enter the hibernation state using the - * platform driver (if available) + * hibernation_platform_enter - Power off the system using the platform driver. */ - int hibernation_platform_enter(void) { int error; @@ -515,7 +524,7 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); - sysdev_suspend(PMSG_HIBERNATE); + syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -526,7 +535,7 @@ int hibernation_platform_enter(void) while (1); Power_up: - sysdev_resume(); + syscore_resume(); local_irq_enable(); enable_nonboot_cpus(); @@ -547,12 +556,12 @@ int hibernation_platform_enter(void) } /** - * power_down - Shut the machine down for hibernation. + * power_down - Shut the machine down for hibernation. * - * Use the platform driver, if configured so; otherwise try - * to power off or reboot. + * Use the platform driver, if configured, to put the system into the sleep + * state corresponding to hibernation, or try to power it off or reboot, + * depending on the value of hibernation_mode. */ - static void power_down(void) { switch (hibernation_mode) { @@ -589,9 +598,8 @@ static int prepare_processes(void) } /** - * hibernate - The granpappy of the built-in hibernation management + * hibernate - Carry out system hibernation, including saving the image. */ - int hibernate(void) { int error; @@ -669,17 +677,20 @@ int hibernate(void) /** - * software_resume - Resume from a saved image. + * software_resume - Resume from a saved hibernation image. * - * Called as a late_initcall (so all devices are discovered and - * initialized), we call swsusp to see if we have a saved image or not. - * If so, we quiesce devices, the restore the saved image. We will - * return above (in hibernate() ) if everything goes well. - * Otherwise, we fail gracefully and return to the normally - * scheduled program. + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. */ - static int software_resume(void) { int error; @@ -809,21 +820,17 @@ static const char * const hibernation_modes[] = { [HIBERNATION_TESTPROC] = "testproc", }; -/** - * disk - Control hibernation mode - * - * Suspend-to-disk can be handled in several ways. We have a few options - * for putting the system to sleep - using the platform driver (e.g. ACPI - * or other hibernation_ops), powering off the system or rebooting the - * system (for testing) as well as the two test modes. +/* + * /sys/power/disk - Control hibernation mode. * - * The system can support 'platform', and that is known a priori (and - * encoded by the presence of hibernation_ops). However, the user may - * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the - * test modes, 'test' or 'testproc'. + * Hibernation can be handled in several ways. There are a few different ways + * to put the system into the sleep state: using the platform driver (e.g. ACPI + * or other hibernation_ops), powering it off or rebooting it (for testing + * mostly), or using one of the two available test modes. * - * show() will display what the mode is currently set to. - * store() will accept one of + * The sysfs file /sys/power/disk provides an interface for selecting the + * hibernation mode to use. Reading from this file causes the available modes + * to be printed. There are 5 modes that can be supported: * * 'platform' * 'shutdown' @@ -831,8 +838,14 @@ static const char * const hibernation_modes[] = { * 'test' * 'testproc' * - * It will only change to 'platform' if the system - * supports it (as determined by having hibernation_ops). + * If a platform hibernation driver is in use, 'platform' will be supported + * and will be used by default. Otherwise, 'shutdown' will be used by default. + * The selected option (i.e. the one corresponding to the current value of + * hibernation_mode) is enclosed by a square bracket. + * + * To select a given hibernation mode it is necessary to write the mode's + * string representation (as returned by reading from /sys/power/disk) back + * into /sys/power/disk. */ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -865,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, return buf-start; } - static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { @@ -967,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att power_attr(image_size); +static ssize_t reserved_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", reserved_size); +} + +static ssize_t reserved_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + reserved_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(reserved_size); + static struct attribute * g[] = { &disk_attr.attr, &resume_attr.attr, &image_size_attr.attr, + &reserved_size_attr.attr, NULL, }; diff --git a/kernel/power/main.c b/kernel/power/main.c index 701853042c28..2981af4ce7cb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -17,9 +17,6 @@ DEFINE_MUTEX(pm_mutex); -unsigned int pm_flags; -EXPORT_SYMBOL(pm_flags); - #ifdef CONFIG_PM_SLEEP /* Routines for PM-transition notifications */ @@ -227,7 +224,7 @@ power_attr(state); * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to - * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. @@ -340,6 +337,7 @@ static int __init pm_init(void) if (error) return error; hibernate_image_size_init(); + hibernate_reserved_size_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 03634be55f62..9a00a0a26280 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -15,6 +15,7 @@ struct swsusp_info { #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ +extern void __init hibernate_reserved_size_init(void); extern void __init hibernate_image_size_init(void); #ifdef CONFIG_ARCH_HIBERNATION_HEADER @@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void); #else /* !CONFIG_HIBERNATION */ +static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} #endif /* !CONFIG_HIBERNATION */ @@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \ /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; +/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +extern unsigned long reserved_size; extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 64db648ff911..ace55889f702 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -41,6 +41,18 @@ static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); /* + * Number of bytes to reserve for memory allocations made by device drivers + * from their ->freeze() and ->freeze_noirq() callbacks so that they don't + * cause image creation to fail (tunable via /sys/power/reserved_size). + */ +unsigned long reserved_size; + +void __init hibernate_reserved_size_init(void) +{ + reserved_size = SPARE_PAGES * PAGE_SIZE; +} + +/* * Preferred image size in bytes (tunable via /sys/power/image_size). * When it is set to N, swsusp will do its best to ensure the image * size will not exceed N bytes, but if that is impossible, it will @@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable) * frame in use. We also need a number of page frames to be free during * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation - * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, - * respectively, both of which are rough estimates). To make this happen, we - * compute the total number of available page frames and allocate at least + * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough + * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * /sys/power/reserved_size, respectively). To make this happen, we compute the + * total number of available page frames and allocate at least * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) * * of them, which corresponds to the maximum size of a hibernation image. * @@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void) count -= totalreserve_pages; /* Compute the maximum number of saveable pages to leave in memory. */ - max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + max_size = (count - (size + PAGES_FOR_IO)) / 2 + - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); /* Compute the desired number of image pages specified by image_size. */ size = DIV_ROUND_UP(image_size, PAGE_SIZE); if (size > max_size) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index de6f86bfa303..1c41ba215419 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/suspend.h> +#include <linux/syscore_ops.h> #include <trace/events/power.h> #include "power.h" @@ -162,13 +163,13 @@ static int suspend_enter(suspend_state_t state) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); - error = sysdev_suspend(PMSG_SUSPEND); + error = syscore_suspend(); if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } - sysdev_resume(); + syscore_resume(); } arch_suspend_enable_irqs(); @@ -209,7 +210,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -220,13 +220,12 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_test(TEST_DEVICES)) goto Recover_platform; - suspend_enter(state); + error = suspend_enter(state); Resume_devices: suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) @@ -287,7 +286,9 @@ int enter_state(suspend_state_t state) goto Finish; pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); + pm_restore_gfp_mask(); Finish: pr_debug("PM: Finishing wakeup.\n"); diff --git a/kernel/power/user.c b/kernel/power/user.c index c36c3b9e8a84..7d02d33be699 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); - if (data->frozen) + if (data->frozen) { + pm_restore_gfp_mask(); thaw_processes(); + } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); @@ -379,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); + data->ready = 0; break; case SNAPSHOT_PLATFORM_SUPPORT: diff --git a/kernel/printk.c b/kernel/printk.c index 36231525e22f..35185392173f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include <linux/smp.h> #include <linux/security.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -53,7 +54,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. */ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ @@ -113,6 +114,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ /* + * If exclusive_console is non-NULL then only this console is to be printed to. + */ +static struct console *exclusive_console; + +/* * Array of consoles built from command line options (console=) */ struct console_cmdline @@ -162,46 +168,74 @@ void log_buf_kexec_setup(void) } #endif +/* requested log_buf_len from kernel cmdline */ +static unsigned long __initdata new_log_buf_len; + +/* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); - unsigned long flags; if (size) size = roundup_pow_of_two(size); - if (size > log_buf_len) { - unsigned start, dest_idx, offset; - char *new_log_buf; + if (size > log_buf_len) + new_log_buf_len = size; - new_log_buf = alloc_bootmem(size); - if (!new_log_buf) { - printk(KERN_WARNING "log_buf_len: allocation failed\n"); - goto out; - } + return 0; +} +early_param("log_buf_len", log_buf_len_setup); - spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = size; - log_buf = new_log_buf; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); +void __init setup_log_buf(int early) +{ + unsigned long flags; + unsigned start, dest_idx, offset; + char *new_log_buf; + int free; + + if (!new_log_buf_len) + return; + + if (early) { + unsigned long mem; + + mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); + if (mem == MEMBLOCK_ERROR) + return; + new_log_buf = __va(mem); + } else { + new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); + } - printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); + if (unlikely(!new_log_buf)) { + pr_err("log_buf_len: %ld bytes not available\n", + new_log_buf_len); + return; } -out: - return 1; -} -__setup("log_buf_len=", log_buf_len_setup); + spin_lock_irqsave(&logbuf_lock, flags); + log_buf_len = new_log_buf_len; + log_buf = new_log_buf; + new_log_buf_len = 0; + free = __LOG_BUF_LEN - log_end; + + offset = start = min(con_start, log_start); + dest_idx = 0; + while (start != log_end) { + unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); + + log_buf[dest_idx] = __log_buf[log_idx_mask]; + start++; + dest_idx++; + } + log_start -= offset; + con_start -= offset; + log_end -= offset; + spin_unlock_irqrestore(&logbuf_lock, flags); + + pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("early log buf free: %d(%d%%)\n", + free, (free * 100) / __LOG_BUF_LEN); +} #ifdef CONFIG_BOOT_PRINTK_DELAY @@ -476,6 +510,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -515,6 +551,71 @@ static void _call_console_drivers(unsigned start, } /* + * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the + * lower 3 bit are the log level, the rest are the log facility. In case + * userspace passes usual userspace syslog messages to /dev/kmsg or + * /dev/ttyprintk, the log prefix might contain the facility. Printk needs + * to extract the correct log level for in-kernel processing, and not mangle + * the original value. + * + * If a prefix is found, the length of the prefix is returned. If 'level' is + * passed, it will be filled in with the log level without a possible facility + * value. If 'special' is passed, the special printk prefix chars are accepted + * and returned. If no valid header is found, 0 is returned and the passed + * variables are not touched. + */ +static size_t log_prefix(const char *p, unsigned int *level, char *special) +{ + unsigned int lev = 0; + char sp = '\0'; + size_t len; + + if (p[0] != '<' || !p[1]) + return 0; + if (p[2] == '>') { + /* usual single digit level number or special char */ + switch (p[1]) { + case '0' ... '7': + lev = p[1] - '0'; + break; + case 'c': /* KERN_CONT */ + case 'd': /* KERN_DEFAULT */ + sp = p[1]; + break; + default: + return 0; + } + len = 3; + } else { + /* multi digit including the level and facility number */ + char *endp = NULL; + + if (p[1] < '0' && p[1] > '9') + return 0; + + lev = (simple_strtoul(&p[1], &endp, 10) & 7); + if (endp == NULL || endp[0] != '>') + return 0; + len = (endp + 1) - p; + } + + /* do not accept special char if not asked for */ + if (sp && !special) + return 0; + + if (special) { + *special = sp; + /* return special char, do not touch level */ + if (sp) + return len; + } + + if (level) + *level = lev; + return len; +} + +/* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. @@ -529,13 +630,9 @@ static void call_console_drivers(unsigned start, unsigned end) cur_index = start; start_print = start; while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') { - msg_level = LOG_BUF(cur_index + 1) - '0'; - cur_index += 3; + if (msg_level < 0 && ((end - cur_index) > 2)) { + /* strip log prefix */ + cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); start_print = cur_index; } while (cur_index != end) { @@ -733,6 +830,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) unsigned long flags; int this_cpu; char *p; + size_t plen; + char special; boot_delay_msec(); printk_delay(); @@ -773,45 +872,52 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); - p = printk_buf; - /* Do we have a loglevel in the string? */ - if (p[0] == '<') { - unsigned char c = p[1]; - if (c && p[2] == '>') { - switch (c) { - case '0' ... '7': /* loglevel */ - current_log_level = c - '0'; - /* Fallthrough - make sure we're on a new line */ - case 'd': /* KERN_DEFAULT */ - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } - /* Fallthrough - skip the loglevel */ - case 'c': /* KERN_CONT */ - p += 3; - break; + /* Read log level and handle special printk prefix */ + plen = log_prefix(p, ¤t_log_level, &special); + if (plen) { + p += plen; + + switch (special) { + case 'c': /* Strip <c> KERN_CONT, continue line */ + plen = 0; + break; + case 'd': /* Strip <d> KERN_DEFAULT, start new line */ + plen = 0; + default: + if (!new_text_line) { + emit_log_char('\n'); + new_text_line = 1; } } } /* - * Copy the output into log_buf. If the caller didn't provide - * appropriate log level tags, we insert them here + * Copy the output into log_buf. If the caller didn't provide + * the appropriate log prefix, we insert them here */ - for ( ; *p; p++) { + for (; *p; p++) { if (new_text_line) { - /* Always output the token */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; new_text_line = 0; + if (plen) { + /* Copy original log prefix */ + int i; + + for (i = 0; i < plen; i++) + emit_log_char(printk_buf[i]); + printed_len += plen; + } else { + /* Add log prefix */ + emit_log_char('<'); + emit_log_char(current_log_level + '0'); + emit_log_char('>'); + printed_len += 3; + } + if (printk_time) { - /* Follow the token with the time */ + /* Add the current time stamp */ char tbuf[50], *tp; unsigned tlen; unsigned long long t; @@ -1160,6 +1266,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1246,6 +1357,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1382,6 +1505,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); @@ -1393,7 +1522,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ diff --git a/kernel/profile.c b/kernel/profile.c index 66f841b7fbd3..961b389fe52f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -126,11 +126,9 @@ int __ref profile_init(void) if (prof_buffer) return 0; - prof_buffer = vmalloc(buffer_bytes); - if (prof_buffer) { - memset(prof_buffer, 0, buffer_bytes); + prof_buffer = vzalloc(buffer_bytes); + if (prof_buffer) return 0; - } free_cpumask_var(prof_cpu_mask); return -ENOMEM; @@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void) mutex_unlock(&profile_flip_mutex); } -void profile_hits(int type, void *__pc, unsigned int nr_hits) +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; struct profile_hit *hits; - if (prof_on != type || !prof_buffer) - return; pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; @@ -419,16 +415,20 @@ out_free: #define profile_discard_flip_buffers() do { } while (0) #define profile_cpu_callback NULL -void profile_hits(int type, void *__pc, unsigned int nr_hits) +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; - - if (prof_on != type || !prof_buffer) - return; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ + +void profile_hits(int type, void *__pc, unsigned int nr_hits) +{ + if (prof_on != type || !prof_buffer) + return; + do_profile_hits(type, __pc, nr_hits); +} EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e40b360..2df115790cd9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -22,6 +22,7 @@ #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/regset.h> +#include <linux/hw_breakpoint.h> /* @@ -37,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) child->parent = new_parent; } -/* - * Turn a tracing stop into a normal stop now, since with no tracer there - * would be no way to wake it up with SIGCONT or SIGKILL. If there was a - * signal sent that would resume the child, but didn't because it was in - * TASK_TRACED, resume it now. - * Requires that irqs be disabled. - */ -static void ptrace_untrace(struct task_struct *child) -{ - spin_lock(&child->sighand->siglock); - if (task_is_traced(child)) { - /* - * If the group stop is completed or in progress, - * this thread was already counted as stopped. - */ - if (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count) - __set_task_state(child, TASK_STOPPED); - else - signal_wake_up(child, 1); - } - spin_unlock(&child->sighand->siglock); -} - -/* - * unptrace a task: move it back to its original parent and - * remove it from the ptrace list. +/** + * __ptrace_unlink - unlink ptracee and restore its execution state + * @child: ptracee to be unlinked * - * Must be called with the tasklist lock write-held. + * Remove @child from the ptrace list, move it back to the original parent, + * and restore the execution state so that it conforms to the group stop + * state. + * + * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer + * exiting. For PTRACE_DETACH, unless the ptracee has been killed between + * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. + * If the ptracer is exiting, the ptracee can be in any state. + * + * After detach, the ptracee should be in a state which conforms to the + * group stop. If the group is stopped or in the process of stopping, the + * ptracee should be put into TASK_STOPPED; otherwise, it should be woken + * up from TASK_TRACED. + * + * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, + * it goes through TRACED -> RUNNING -> STOPPED transition which is similar + * to but in the opposite direction of what happens while attaching to a + * stopped task. However, in this direction, the intermediate RUNNING + * state is not hidden even from the current ptracer and if it immediately + * re-attaches and performs a WNOHANG wait(2), it may fail. + * + * CONTEXT: + * write_lock_irq(tasklist_lock) */ void __ptrace_unlink(struct task_struct *child) { @@ -75,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - if (task_is_traced(child)) - ptrace_untrace(child); + spin_lock(&child->sighand->siglock); + + /* + * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * @child isn't dead. + */ + if (!(child->flags & PF_EXITING) && + (child->signal->flags & SIGNAL_STOP_STOPPED || + child->signal->group_stop_count)) + child->group_stop |= GROUP_STOP_PENDING; + + /* + * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick + * @child in the butt. Note that @resume should be used iff @child + * is in TASK_TRACED; otherwise, we might unduly disrupt + * TASK_KILLABLE sleeps. + */ + if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + signal_wake_up(child, task_is_traced(child)); + + spin_unlock(&child->sighand->siglock); } /* @@ -95,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ read_lock(&tasklist_lock); if ((child->ptrace & PT_PTRACED) && child->parent == current) { - ret = 0; /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ spin_lock_irq(&child->sighand->siglock); - if (task_is_stopped(child)) - child->state = TASK_TRACED; - else if (!task_is_traced(child) && !kill) - ret = -ESRCH; + WARN_ON_ONCE(task_is_stopped(child)); + if (task_is_traced(child) || kill) + ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); @@ -134,21 +150,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if ((cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + goto ok; + if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + goto ok; + rcu_read_unlock(); + return -EPERM; +ok: rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -165,6 +184,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) static int ptrace_attach(struct task_struct *task) { + bool wait_trap = false; int retval; audit_ptrace(task); @@ -198,18 +218,48 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) + if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + spin_lock(&task->sighand->siglock); + + /* + * If the task is already STOPPED, set GROUP_STOP_PENDING and + * TRAPPING, and kick it so that it transits to TRACED. TRAPPING + * will be cleared if the child completes the transition or any + * event which clears the group stop states happens. We'll wait + * for the transition to complete before returning from this + * function. + * + * This hides STOPPED -> RUNNING -> TRACED transition from the + * attaching thread but a different thread in the same group can + * still observe the transient RUNNING state. IOW, if another + * thread's WNOHANG wait(2) on the stopped tracee races against + * ATTACH, the wait(2) may fail due to the transient RUNNING. + * + * The following task_is_stopped() test is safe as both transitions + * in and out of STOPPED are protected by siglock. + */ + if (task_is_stopped(task)) { + task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + signal_wake_up(task, 1); + wait_trap = true; + } + + spin_unlock(&task->sighand->siglock); + retval = 0; unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: + if (wait_trap) + wait_event(current->signal->wait_chldexit, + !(task->group_stop & GROUP_STOP_TRAPPING)); return retval; } @@ -312,8 +362,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); - if (!child->exit_state) - wake_up_state(child, TASK_TRACED | TASK_STOPPED); } write_unlock_irq(&tasklist_lock); @@ -514,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, } child->exit_code = data; - wake_up_process(child); + wake_up_state(child, __TASK_TRACED); return 0; } @@ -876,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +int ptrace_get_breakpoints(struct task_struct *tsk) +{ + if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) + return 0; + + return -1; +} + +void ptrace_put_breakpoints(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) + flush_ptrace_hw_breakpoint(tsk); +} +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a23a57a976d1..7784bd216b6a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -214,14 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. */ #ifndef CONFIG_PREEMPT - WARN_ON(1); + WARN_ON_ONCE(1); return 0; -#else +#endif if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { - WARN_ON(1); + WARN_ON_ONCE(1); return 0; } rcu_barrier(); @@ -229,7 +246,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) rcu_barrier_bh(); debug_object_free(head, &rcuhead_debug_descr); return 1; -#endif default: return 0; } diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9a46d5..7bbac7d0f5ab 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -35,15 +35,16 @@ #include <linux/init.h> #include <linux/time.h> #include <linux/cpu.h> +#include <linux/prefetch.h> /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ static struct task_struct *rcu_kthread_task; static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); static unsigned long have_rcu_kthread_work; -static void invoke_rcu_kthread(void); /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; +static void invoke_rcu_kthread(void); static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); static int rcu_kthread(void *arg); static void __call_rcu(struct rcu_head *head, @@ -79,36 +80,45 @@ void rcu_exit_nohz(void) #endif /* #ifdef CONFIG_NO_HZ */ /* - * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). - * Also disable irqs to avoid confusion due to interrupt handlers + * Helper function for rcu_sched_qs() and rcu_bh_qs(). + * Also irqs are disabled to avoid confusion due to interrupt handlers * invoking call_rcu(). */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - unsigned long flags; - - local_irq_save(flags); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; - local_irq_restore(flags); return 1; } - local_irq_restore(flags); return 0; } /* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_kthread(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +/* * Record an rcu quiescent state. And an rcu_bh quiescent state while we * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ void rcu_sched_qs(int cpu) { + unsigned long flags; + + local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) invoke_rcu_kthread(); + local_irq_restore(flags); } /* @@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu) */ void rcu_bh_qs(int cpu) { + unsigned long flags; + + local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) invoke_rcu_kthread(); + local_irq_restore(flags); } /* @@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - list->func(list); + __rcu_reclaim(list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); @@ -208,20 +222,6 @@ static int rcu_kthread(void *arg) } /* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); - local_irq_restore(flags); -} - -/* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_sched() from within an RCU read-side critical section. * Therefore, any legal call to synchronize_sched() is a quiescent diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 015abaea962a..f259c676195f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk { u8 completed; /* Last grace period completed. */ /* If all three are equal, RCU is idle. */ #ifdef CONFIG_RCU_BOOST - s8 boosted_this_gp; /* Has boosting already happened? */ unsigned long boost_time; /* When to start boosting (jiffies) */ #endif /* #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_TRACE unsigned long n_grace_periods; #ifdef CONFIG_RCU_BOOST unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; - unsigned long n_normal_balk_blkd_tasks; - unsigned long n_normal_balk_gp_tasks; - unsigned long n_normal_balk_boost_tasks; - unsigned long n_normal_balk_boosted; - unsigned long n_normal_balk_notyet; - unsigned long n_normal_balk_nos; - unsigned long n_exp_balk_blkd_tasks; - unsigned long n_exp_balk_nos; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ #endif /* #ifdef CONFIG_RCU_TRACE */ }; @@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t) #ifdef CONFIG_RCU_BOOST static void rcu_initiate_boost_trace(void); -static void rcu_initiate_exp_boost_trace(void); #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m) "N."[!rcu_preempt_ctrlblk.gp_tasks], "E."[!rcu_preempt_ctrlblk.exp_tasks]); #ifdef CONFIG_RCU_BOOST - seq_printf(m, " ttb=%c btg=", - "B."[!rcu_preempt_ctrlblk.boost_tasks]); - switch (rcu_preempt_ctrlblk.boosted_this_gp) { - case -1: - seq_puts(m, "exp"); - break; - case 0: - seq_puts(m, "no"); - break; - case 1: - seq_puts(m, "begun"); - break; - case 2: - seq_puts(m, "done"); - break; - default: - seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); - } - seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + " ", + "B."[!rcu_preempt_ctrlblk.boost_tasks], rcu_preempt_ctrlblk.n_tasks_boosted, rcu_preempt_ctrlblk.n_exp_boosts, rcu_preempt_ctrlblk.n_normal_boosts, (int)(jiffies & 0xffff), (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); - seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", - "normal balk", - rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, - rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, - rcu_preempt_ctrlblk.n_normal_balk_boosted, - rcu_preempt_ctrlblk.n_normal_balk_notyet, - rcu_preempt_ctrlblk.n_normal_balk_nos); - seq_printf(m, " exp balk: bt=%lu nos=%lu\n", - rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, - rcu_preempt_ctrlblk.n_exp_balk_nos); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", + " balk", + rcu_preempt_ctrlblk.n_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, + rcu_preempt_ctrlblk.n_balk_boost_tasks, + rcu_preempt_ctrlblk.n_balk_notyet, + rcu_preempt_ctrlblk.n_balk_nos); #endif /* #ifdef CONFIG_RCU_BOOST */ } @@ -271,25 +255,59 @@ static int rcu_boost(void) { unsigned long flags; struct rt_mutex mtx; - struct list_head *np; struct task_struct *t; + struct list_head *tb; - if (rcu_preempt_ctrlblk.boost_tasks == NULL) + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) return 0; /* Nothing to boost. */ + raw_local_irq_save(flags); - rcu_preempt_ctrlblk.boosted_this_gp++; - t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, - rcu_node_entry); - np = rcu_next_node_entry(t); + + /* + * Recheck with irqs disabled: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own + * if we are preempted just before disabling irqs. + */ + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + raw_local_irq_restore(flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rcu_preempt_ctrlblk.exp_tasks != NULL) { + tb = rcu_preempt_ctrlblk.exp_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else { + tb = rcu_preempt_ctrlblk.boost_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + */ + t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; raw_local_irq_restore(flags); rt_mutex_lock(&mtx); - RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); - rcu_preempt_ctrlblk.boosted_this_gp++; - rt_mutex_unlock(&mtx); - return rcu_preempt_ctrlblk.boost_tasks != NULL; + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rcu_preempt_ctrlblk.boost_tasks != NULL || + rcu_preempt_ctrlblk.exp_tasks != NULL; } /* @@ -304,42 +322,25 @@ static int rcu_boost(void) */ static int rcu_initiate_boost(void) { - if (!rcu_preempt_blocked_readers_cgp()) { - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); + if (!rcu_preempt_blocked_readers_cgp() && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); return 0; } - if (rcu_preempt_ctrlblk.gp_tasks != NULL && - rcu_preempt_ctrlblk.boost_tasks == NULL && - rcu_preempt_ctrlblk.boosted_this_gp == 0 && - ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { - rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; + if (rcu_preempt_ctrlblk.exp_tasks != NULL || + (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { + if (rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.gp_tasks; invoke_rcu_kthread(); - RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); } else RCU_TRACE(rcu_initiate_boost_trace()); return 1; } -/* - * Initiate boosting for an expedited grace period. - */ -static void rcu_initiate_expedited_boost(void) -{ - unsigned long flags; - - raw_local_irq_save(flags); - if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { - rcu_preempt_ctrlblk.boost_tasks = - rcu_preempt_ctrlblk.blkd_tasks.next; - rcu_preempt_ctrlblk.boosted_this_gp = -1; - invoke_rcu_kthread(); - RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); - } else - RCU_TRACE(rcu_initiate_exp_boost_trace()); - raw_local_irq_restore(flags); -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* * Do priority-boost accounting for the start of a new grace period. @@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void) static void rcu_preempt_boost_start_gp(void) { rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; - if (rcu_preempt_ctrlblk.boosted_this_gp > 0) - rcu_preempt_ctrlblk.boosted_this_gp = 0; } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -372,13 +371,6 @@ static int rcu_initiate_boost(void) } /* - * If there is no RCU priority boosting, we don't initiate expedited boosting. - */ -static void rcu_initiate_expedited_boost(void) -{ -} - -/* * If there is no RCU priority boosting, nothing to do at grace-period start. */ static void rcu_preempt_boost_start_gp(void) @@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void) if (!rcu_preempt_gp_in_progress()) return; /* - * Check up on boosting. If there are no readers blocking the + * Check up on boosting. If there are readers blocking the * current grace period, leave. */ if (rcu_initiate_boost()) @@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t) empty = !rcu_preempt_blocked_readers_cgp(); empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; np = rcu_next_node_entry(t); - list_del(&t->rcu_node_entry); + list_del_init(&t->rcu_node_entry); if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) rcu_preempt_ctrlblk.gp_tasks = np; if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) @@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) rcu_preempt_ctrlblk.boost_tasks = np; #endif /* #ifdef CONFIG_RCU_BOOST */ - INIT_LIST_HEAD(&t->rcu_node_entry); /* * If this was the last task on the current list, and if @@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void) rpcp->exp_tasks = rpcp->blkd_tasks.next; if (rpcp->exp_tasks == &rpcp->blkd_tasks) rpcp->exp_tasks = NULL; - local_irq_restore(flags); /* Wait for tail of ->blkd_tasks list to drain. */ - if (rcu_preempted_readers_exp()) - rcu_initiate_expedited_boost(); + if (!rcu_preempted_readers_exp()) + local_irq_restore(flags); + else { + rcu_initiate_boost(); + local_irq_restore(flags); wait_event(sync_rcu_preempt_exp_wq, !rcu_preempted_readers_exp()); + } /* Clean up and exit. */ barrier(); /* ensure expedited GP seen before counter increment. */ @@ -852,7 +846,7 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ @@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void) static void rcu_initiate_boost_trace(void) { - if (rcu_preempt_ctrlblk.gp_tasks == NULL) - rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_balk_blkd_tasks++; + else if (rcu_preempt_ctrlblk.gp_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; else if (rcu_preempt_ctrlblk.boost_tasks != NULL) - rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; - else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) - rcu_preempt_ctrlblk.n_normal_balk_boosted++; + rcu_preempt_ctrlblk.n_balk_boost_tasks++; else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) - rcu_preempt_ctrlblk.n_normal_balk_notyet++; - else - rcu_preempt_ctrlblk.n_normal_balk_nos++; -} - -static void rcu_initiate_exp_boost_trace(void) -{ - if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) - rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; + rcu_preempt_ctrlblk.n_balk_notyet++; else - rcu_preempt_ctrlblk.n_exp_balk_nos++; + rcu_preempt_ctrlblk.n_balk_nos++; } #endif /* #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 89613f97ff26..2e138db03382 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,7 +47,6 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <asm/byteorder.h> -#include <linux/sched.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " @@ -132,7 +131,7 @@ struct rcu_torture { static LIST_HEAD(rcu_torture_freelist); static struct rcu_torture __rcu *rcu_torture_current; -static long rcu_torture_current_version; +static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = @@ -147,8 +146,6 @@ static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; static long n_rcu_torture_boost_ktrerror; static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_allocerror; -static long n_rcu_torture_boost_afferror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; @@ -164,11 +161,11 @@ static int stutter_pause_test; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#ifdef CONFIG_RCU_BOOST +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 -#else /* #ifdef CONFIG_RCU_BOOST */ +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ #define rcu_can_boost() 0 -#endif /* #else #ifdef CONFIG_RCU_BOOST */ +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ @@ -752,6 +749,7 @@ static int rcu_torture_boost(void *arg) n_rcu_torture_boost_rterror++; } + init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { /* Wait for the next test interval. */ @@ -811,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + destroy_rcu_head_on_stack(&rbi.rcu); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); @@ -887,7 +886,7 @@ rcu_torture_writer(void *arg) old_rp->rtort_pipe_count++; cur_ops->deferred_free(old_rp); } - rcu_torture_current_version++; + rcutorture_record_progress(++rcu_torture_current_version); oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -1067,8 +1066,8 @@ rcu_torture_printk(char *page) } cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], - "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " + "rtmbe: %d rtbke: %ld rtbre: %ld " "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, @@ -1079,16 +1078,12 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror, - n_rcu_torture_boost_allocerror, - n_rcu_torture_boost_afferror, n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_allocerror != 0 || - n_rcu_torture_boost_afferror != 0 || n_rcu_torture_boost_failure != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); @@ -1332,6 +1327,7 @@ rcu_torture_cleanup(void) int i; mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); if (fullstop == FULLSTOP_SHUTDOWN) { printk(KERN_WARNING /* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1487,8 +1483,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_boost_ktrerror = 0; n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_allocerror = 0; - n_rcu_torture_boost_afferror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) @@ -1625,6 +1619,7 @@ rcu_torture_init(void) } } register_reboot_notifier(&rcutorture_shutdown_nb); + rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8e..77a7671dd147 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -36,7 +36,7 @@ #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/nmi.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/bitops.h> #include <linux/module.h> #include <linux/completion.h> @@ -47,6 +47,9 @@ #include <linux/mutex.h> #include <linux/time.h> #include <linux/kernel_stat.h> +#include <linux/wait.h> +#include <linux/kthread.h> +#include <linux/prefetch.h> #include "rcutree.h" @@ -79,10 +82,40 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static struct rcu_state *rcu_state; + int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); /* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); +static char rcu_kthreads_spawnable; + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void invoke_rcu_cpu_kthread(void); + +#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ + +/* + * Track the rcutorture test sequence number and the update version + * number within a given test. The rcutorture_testseq is incremented + * on every rcutorture module load and unload, so has an odd value + * when a test is running. The rcutorture_vernum is set to zero + * when rcutorture starts and is incremented on each rcutorture update. + * These variables enable correlating rcutorture output with the + * RCU tracing information. + */ +unsigned long rcutorture_testseq; +unsigned long rcutorture_vernum; + +/* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -124,11 +157,12 @@ void rcu_note_context_switch(int cpu) rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); } +EXPORT_SYMBOL_GPL(rcu_note_context_switch); #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, - .dynticks = 1, + .dynticks = ATOMIC_INIT(1), }; #endif /* #ifdef CONFIG_NO_HZ */ @@ -140,10 +174,8 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +int rcu_cpu_stall_suppress __read_mostly; module_param(rcu_cpu_stall_suppress, int, 0644); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -176,6 +208,31 @@ void rcu_bh_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); /* + * Record the number of times rcutorture tests have been initiated and + * terminated. This information allows the debugfs tracing stats to be + * correlated to the rcutorture messages, even when the rcutorture module + * is being repeatedly loaded and unloaded. In other words, we cannot + * store this state in rcutorture itself. + */ +void rcutorture_record_test_transition(void) +{ + rcutorture_testseq++; + rcutorture_vernum = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); + +/* + * Record the number of writer passes through the current rcutorture test. + * This is also used to correlate debugfs tracing stats with the rcutorture + * messages. + */ +void rcutorture_record_progress(unsigned long vernum) +{ + rcutorture_vernum++; +} +EXPORT_SYMBOL_GPL(rcutorture_record_progress); + +/* * Force a quiescent state for RCU-sched. */ void rcu_sched_force_quiescent_state(void) @@ -234,8 +291,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 1; } - /* If preemptable RCU, no point in sending reschedule IPI. */ - if (rdp->preemptable) + /* If preemptible RCU, no point in sending reschedule IPI. */ + if (rdp->preemptible) return 0; /* The CPU is online, so send it a reschedule IPI. */ @@ -264,13 +321,25 @@ void rcu_enter_nohz(void) unsigned long flags; struct rcu_dynticks *rdtp; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - rdtp->dynticks++; - rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks & 0x1); + if (--rdtp->dynticks_nesting) { + local_irq_restore(flags); + return; + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); local_irq_restore(flags); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (in_irq() && + (__get_cpu_var(rcu_sched_data).nxtlist || + __get_cpu_var(rcu_bh_data).nxtlist || + rcu_preempt_needs_cpu(smp_processor_id()))) + set_need_resched(); } /* @@ -286,11 +355,16 @@ void rcu_exit_nohz(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - rdtp->dynticks++; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); + if (rdtp->dynticks_nesting++) { + local_irq_restore(flags); + return; + } + smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); local_irq_restore(flags); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ } /** @@ -304,11 +378,15 @@ void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks & 0x1) + if (rdtp->dynticks_nmi_nesting == 0 && + (atomic_read(&rdtp->dynticks) & 0x1)) return; - rdtp->dynticks_nmi++; - WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + rdtp->dynticks_nmi_nesting++; + smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); } /** @@ -322,11 +400,14 @@ void rcu_nmi_exit(void) { struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->dynticks & 0x1) + if (rdtp->dynticks_nmi_nesting == 0 || + --rdtp->dynticks_nmi_nesting != 0) return; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - rdtp->dynticks_nmi++; - WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force delay to next write. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } /** @@ -337,13 +418,7 @@ void rcu_nmi_exit(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (rdtp->dynticks_nesting++) - return; - rdtp->dynticks++; - WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + rcu_exit_nohz(); } /** @@ -355,18 +430,7 @@ void rcu_irq_enter(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); - - if (--rdtp->dynticks_nesting) - return; - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - rdtp->dynticks++; - WARN_ON_ONCE(rdtp->dynticks & 0x1); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__this_cpu_read(rcu_sched_data.nxtlist) || - __this_cpu_read(rcu_bh_data.nxtlist)) - set_need_resched(); + rcu_enter_nohz(); } #ifdef CONFIG_SMP @@ -378,19 +442,8 @@ void rcu_irq_exit(void) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - int ret; - int snap; - int snap_nmi; - - snap = rdp->dynticks->dynticks; - snap_nmi = rdp->dynticks->dynticks_nmi; - smp_mb(); /* Order sampling of snap with end of grace period. */ - rdp->dynticks_snap = snap; - rdp->dynticks_nmi_snap = snap_nmi; - ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); - if (ret) - rdp->dynticks_fqs++; - return ret; + rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + return 0; } /* @@ -401,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - long curr; - long curr_nmi; - long snap; - long snap_nmi; + unsigned long curr; + unsigned long snap; - curr = rdp->dynticks->dynticks; - snap = rdp->dynticks_snap; - curr_nmi = rdp->dynticks->dynticks_nmi; - snap_nmi = rdp->dynticks_nmi_snap; - smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned long)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -420,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr != snap || (curr & 0x1) == 0) && - (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { + if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { rdp->dynticks_fqs++; return 1; } @@ -450,8 +497,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #endif /* #else #ifdef CONFIG_NO_HZ */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -537,21 +582,24 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { - long delta; + unsigned long j; + unsigned long js; struct rcu_node *rnp; if (rcu_cpu_stall_suppress) return; - delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); + j = ACCESS_ONCE(jiffies); + js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); - } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { + } else if (rcu_gp_in_progress(rsp) && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { - /* They had two time units to dump stack, so complain. */ + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(rsp); } } @@ -587,26 +635,6 @@ static void __init check_cpu_stall_init(void) atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - -static void record_gp_stall_check_time(struct rcu_state *rsp) -{ -} - -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) -{ -} - -void rcu_cpu_stall_reset(void) -{ -} - -static void __init check_cpu_stall_init(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice @@ -809,6 +837,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -844,6 +873,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -864,7 +894,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + unsigned long gp_duration; + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + + /* + * Ensure that all grace-period and pre-grace-period activity + * is seen before the assignment to rsp->completed. + */ + smp_mb(); /* See above block comment. */ + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; rsp->completed = rsp->gpnum; rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ @@ -894,7 +935,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } rnp->qsmask &= ~mask; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1037,6 +1078,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) /* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { @@ -1045,6 +1088,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; + struct task_struct *t; + + /* Stop the CPU's kthread. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1082,6 +1133,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); + rcu_node_kthread_setaffinity(rnp, -1); } /* @@ -1143,7 +1195,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - list->func(list); + __rcu_reclaim(list); list = next; if (++count >= rdp->blimit) break; @@ -1179,7 +1231,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); } /* @@ -1225,7 +1277,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); } #ifdef CONFIG_SMP @@ -1233,6 +1285,8 @@ void rcu_check_callbacks(int cpu, int user) /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. + * Also initiate boosting for any threads blocked on the root rcu_node. + * * The caller must have suppressed start of new grace periods. */ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) @@ -1251,7 +1305,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) return; } if (rnp->qsmask == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ continue; } cpu = rnp->grplo; @@ -1269,6 +1323,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) } raw_spin_unlock_irqrestore(&rnp->lock, flags); } + rnp = rcu_get_root(rsp); + if (rnp->qsmask == 0) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + } } /* @@ -1389,31 +1448,347 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* * Do softirq processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static void rcu_process_callbacks(void) { - /* - * Memory references from any prior RCU read-side critical sections - * executed by the interrupted code must be seen before any RCU - * grace-period manipulations below. - */ - smp_mb(); /* See above block comment. */ - __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - /* - * Memory references from any later RCU read-side critical sections - * executed by the interrupted code must be seen after any RCU - * grace-period manipulations above. - */ - smp_mb(); /* See above block comment. */ - /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ rcu_needs_cpu_flush(); } +/* + * Wake up the current CPU's kthread. This replaces raise_softirq() + * in earlier versions of RCU. Note that because we are running on + * the current CPU with interrupts disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. + */ +static void invoke_rcu_cpu_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { + local_irq_restore(flags); + return; + } + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + local_irq_restore(flags); +} + +/* + * Wake up the specified per-rcu_node-structure kthread. + * Because the per-rcu_node kthreads are immortal, we don't need + * to do anything to keep them alive. + */ +static void invoke_rcu_node_kthread(struct rcu_node *rnp) +{ + struct task_struct *t; + + t = rnp->node_kthread_task; + if (t != NULL) + wake_up_process(t); +} + +/* + * Set the specified CPU's kthread to run RT or not, as specified by + * the to_rt argument. The CPU-hotplug locks are held, so the task + * is not going away. + */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ + int policy; + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t == NULL) + return; + if (to_rt) { + policy = SCHED_FIFO; + sp.sched_priority = RCU_KTHREAD_PRIO; + } else { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + } + sched_setscheduler_nocheck(t, policy, &sp); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + * We wake up the per-rcu_node kthread, which in turn will wake up + * the booster kthread. + */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); + struct rcu_node *rnp = rdp->mynode; + + atomic_or(rdp->grpmask, &rnp->wakemask); + invoke_rcu_node_kthread(rnp); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +{ + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, f, arg); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + set_user_nice(current, 19); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. + * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. + */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + *statusp = RCU_KTHREAD_WAITING; + rcu_wait(*workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + *statusp = RCU_KTHREAD_RUNNING; + per_cpu(rcu_cpu_kthread_loops, cpu)++; + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + *statusp = RCU_KTHREAD_YIELDING; + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + spincnt = 0; + } + } + *statusp = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task. There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. + */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + kthread_bind(t, cpu); + set_task_state(t, TASK_INTERRUPTIBLE); + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. + */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + rnp->node_kthread_status = RCU_KTHREAD_WAITING; + rcu_wait(atomic_read(&rnp->wakemask) != 0); + rnp->node_kthread_status = RCU_KTHREAD_RUNNING; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = atomic_xchg(&rnp->wakemask, 0); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + /* NOTREACHED */ + rnp->node_kthread_status = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if ((mask & 0x1) && cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + rcu_boost_kthread_setaffinity(rnp, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + * Called during boot before online/offline can happen, or, if + * during runtime, with the main CPU-hotplug locks held. So only + * one of these can be executing at a time. + */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + unsigned long flags; + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + rnp->qsmaskinit == 0) + return 0; + if (rnp->node_kthread_task == NULL) { + t = kthread_create(rcu_node_kthread, (void *)rnp, + "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + set_task_state(t, TASK_INTERRUPTIBLE); + rnp->node_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_kthreads_spawnable = 1; + for_each_possible_cpu(cpu) { + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rnp = rcu_get_root(rcu_state); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) { + rcu_for_each_leaf_node(rcu_state, rnp) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -1439,6 +1814,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + rdp->qlen++; + + /* If interrupts were disabled, don't dive into RCU core. */ + if (irqs_disabled_flags(flags)) { + local_irq_restore(flags); + return; + } /* * Force the grace period if too many callbacks or too long waiting. @@ -1447,7 +1829,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ - if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ rcu_process_gp_end(rsp, rdp); @@ -1583,7 +1965,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) * or RCU-bh, force a local reschedule. */ rdp->n_rp_qs_pending++; - if (!rdp->preemptable && + if (!rdp->preemptible && ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); @@ -1760,7 +2142,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void __cpuinit -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) { unsigned long flags; unsigned long mask; @@ -1772,7 +2154,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptable = preemptable; + rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -1813,6 +2195,19 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } +static void __cpuinit rcu_online_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_kthreads_spawnable) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } +} + /* * Handle CPU online/offline notification events. */ @@ -1820,11 +2215,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); + rcu_online_kthreads(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + rcu_node_kthread_setaffinity(rnp, -1); + rcu_cpu_kthread_setrt(cpu, 1); + break; + case CPU_DOWN_PREPARE: + rcu_node_kthread_setaffinity(rnp, cpu); + rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -1943,10 +2350,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, j / rsp->levelspread[i - 1]; } rnp->level = i; - INIT_LIST_HEAD(&rnp->blocked_tasks[0]); - INIT_LIST_HEAD(&rnp->blocked_tasks[1]); - INIT_LIST_HEAD(&rnp->blocked_tasks[2]); - INIT_LIST_HEAD(&rnp->blocked_tasks[3]); + INIT_LIST_HEAD(&rnp->blkd_tasks); } } @@ -1968,7 +2372,6 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e8f057e44e3e..7b9a08b4aaea 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,13 +84,19 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - int dynticks_nesting; /* Track nesting level, sort of. */ - int dynticks; /* Even value for dynticks-idle, else odd. */ - int dynticks_nmi; /* Even value for either dynticks-idle or */ - /* not in nmi handler, else odd. So this */ - /* remains even for nmi from irq handler. */ + int dynticks_nesting; /* Track irq/process nesting level. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ }; +/* RCU's kthread states for tracing. */ +#define RCU_KTHREAD_STOPPED 0 +#define RCU_KTHREAD_RUNNING 1 +#define RCU_KTHREAD_WAITING 2 +#define RCU_KTHREAD_OFFCPU 3 +#define RCU_KTHREAD_YIELDING 4 +#define RCU_KTHREAD_MAX 4 + /* * Definition for node within the RCU grace-period-detection hierarchy. */ @@ -109,10 +115,13 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blocked_tasks[] */ + unsigned long expmask; /* Groups that have ->blkd_tasks */ /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ + atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ + /* Since this has meaning only for leaf */ + /* rcu_node structures, 32 bits suffices. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -122,11 +131,62 @@ struct rcu_node { u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; - struct list_head blocked_tasks[4]; - /* Tasks blocked in RCU read-side critsect. */ - /* Grace period number (->gpnum) x blocked */ - /* by tasks on the (x & 0x1) element of the */ - /* blocked_tasks[] array. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to the first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there can cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority boosted, or NULL if no priority */ + /* boosting is needed for this rcu_node */ + /* structure. If there are no tasks */ + /* queued on this rcu_node structure that */ + /* are blocking the current grace period, */ + /* there can be no such task. */ + unsigned long boost_time; + /* When to start boosting (jiffies). */ + struct task_struct *boost_kthread_task; + /* kthread that takes care of priority */ + /* boosting for this rcu_node structure. */ + unsigned int boost_kthread_status; + /* State of boost_kthread_task for tracing. */ + unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ + unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ + unsigned long n_normal_boosts; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notblocked; + /* Refused to boost: RCU RS CS still running. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + struct task_struct *node_kthread_task; + /* kthread that takes care of this rcu_node */ + /* structure, for example, awakening the */ + /* per-CPU kthreads as needed. */ + unsigned int node_kthread_status; + /* State of node_kthread_task for tracing. */ } ____cacheline_internodealigned_in_smp; /* @@ -175,7 +235,7 @@ struct rcu_data { bool passed_quiesc; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool preemptable; /* Preemptable RCU? */ + bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -218,7 +278,6 @@ struct rcu_data { /* 3) dynticks interface. */ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ #endif /* #ifdef CONFIG_NO_HZ */ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ @@ -254,7 +313,6 @@ struct rcu_data { #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -272,13 +330,16 @@ struct rcu_data { /* scheduling clock irq */ /* before ratting on them. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE -#define RCU_CPU_STALL_SUPPRESS_INIT 0 -#else -#define RCU_CPU_STALL_SUPPRESS_INIT 1 -#endif - -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#define rcu_wait(cond) \ +do { \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + schedule(); \ + } \ + __set_current_state(TASK_RUNNING); \ +} while (0) /* * RCU global state, including node hierarchy. This hierarchy is @@ -325,12 +386,12 @@ struct rcu_state { /* due to lock unavailable. */ unsigned long n_force_qs_ngp; /* Number of calls leaving */ /* due to no GP active. */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR unsigned long gp_start; /* Time at which GP started, */ /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ char *name; /* Name of structure. */ }; @@ -361,16 +422,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); static void rcu_bootup_announce(void); long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(int cpu); -static int rcu_preempted_readers(struct rcu_node *rnp); +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_stall_reset(void); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static int rcu_preempt_offline_tasks(struct rcu_state *rsp, @@ -390,5 +449,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a3638710dc67..a767b7dac365 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1,7 +1,7 @@ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO - "\tRCU-based detection of stalled CPUs is disabled.\n"); -#endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif @@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void) struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); rcu_bootup_announce_oddness(); } @@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Record a preemptable-RCU quiescent state for the specified CPU. Note + * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. @@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu) * We have entered the scheduler, and the current task might soon be * context-switched away from. If this task is in an RCU read-side * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the appropriate entry - * of the blocked_tasks[] array. The task will dequeue itself when - * it exits the outermost enclosing RCU read-side critical section. - * Therefore, the current grace period cannot be permitted to complete - * until the blocked_tasks[] entry indexed by the low-order bit of - * rnp->gpnum empties. + * record that fact, so we enqueue the task on the blkd_tasks list. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the blkd_tasks list entries + * predating the current grace period drain, in other words, until + * rnp->gp_tasks becomes NULL. * * Caller must disable preemption. */ @@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; - int phase; struct rcu_data *rdp; struct rcu_node *rnp; @@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu) * (i.e., this CPU has not yet passed through a quiescent * state for the current grace period), then as long * as that task remains queued, the current grace period - * cannot end. + * cannot end. Note that there is some uncertainty as + * to exactly when the current grace period started. + * We take a conservative approach, which can result + * in unnecessarily waiting on tasks that started very + * slightly after the current grace period began. C'est + * la vie!!! * * But first, note that the current CPU must still be * on line! */ WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; - list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); + if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { + list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); + rnp->gp_tasks = &t->rcu_node_entry; +#ifdef CONFIG_RCU_BOOST + if (rnp->boost_tasks != NULL) + rnp->boost_tasks = rnp->gp_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + } else { + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + if (rnp->qsmask & rdp->grpmask) + rnp->gp_tasks = &t->rcu_node_entry; + } raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Tree-preemptable RCU implementation for rcu_read_lock(). + * Tree-preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated * if we block. */ @@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); * for the specified rcu_node structure. If the caller needs a reliable * answer, it must hold the rcu_node's ->lock. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - int phase = rnp->gpnum & 0x1; - - return !list_empty(&rnp->blocked_tasks[phase]) || - !list_empty(&rnp->blocked_tasks[phase + 2]); + return rnp->gp_tasks != NULL; } /* @@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) unsigned long mask; struct rcu_node *rnp_p; - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t, + struct rcu_node *rnp) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rnp->blkd_tasks) + np = NULL; + return np; +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. @@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t) int empty; int empty_exp; unsigned long flags; + struct list_head *np; struct rcu_node *rnp; int special; @@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempted_readers(rnp); + empty = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp->gp_tasks = np; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp->exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ t->rcu_blocked_node = NULL; /* @@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t) else rcu_report_unblock_qs_rnp(rnp, flags); +#ifdef CONFIG_RCU_BOOST + /* Unboost if we were boosted. */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. @@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* - * Tree-preemptable RCU implementation for rcu_read_unlock(). + * Tree-preemptible RCU implementation for rcu_read_unlock(). * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then * invoke rcu_read_unlock_special() to clean up after a context switch @@ -356,8 +398,6 @@ void __rcu_read_unlock(void) } EXPORT_SYMBOL_GPL(__rcu_read_unlock); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - #ifdef CONFIG_RCU_CPU_STALL_VERBOSE /* @@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) { unsigned long flags; - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - raw_spin_lock_irqsave(&rnp->lock, flags); - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + raw_spin_lock_irqsave(&rnp->lock, flags); + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) */ static void rcu_print_task_stall(struct rcu_node *rnp) { - struct list_head *lp; - int phase; struct task_struct *t; - if (rcu_preempted_readers(rnp)) { - phase = rnp->gpnum & 0x1; - lp = &rnp->blocked_tasks[phase]; - list_for_each_entry(t, lp, rcu_node_entry) - printk(" P%d", t->pid); - } + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + printk(" P%d", t->pid); } /* @@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void) rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock * must be held by the caller. + * + * Also, if there are blocked tasks on the list, they automatically + * block the newly created grace period, so set up ->gp_tasks accordingly. */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { - WARN_ON_ONCE(rcu_preempted_readers(rnp)); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (!list_empty(&rnp->blkd_tasks)) + rnp->gp_tasks = rnp->blkd_tasks.next; WARN_ON_ONCE(rnp->qsmask); } @@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i; struct list_head *lp; struct list_head *lp_root; int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *tp; + struct task_struct *t; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); return 0; /* Shouldn't happen: at least one CPU online. */ } - WARN_ON_ONCE(rnp != rdp->mynode && - (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]) || - !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]))); + + /* If we are on an internal node, complain bitterly. */ + WARN_ON_ONCE(rnp != rdp->mynode); /* - * Move tasks up to root rcu_node. Rely on the fact that the - * root rcu_node can be at most one ahead of the rest of the - * rcu_nodes in terms of gp_num value. This fact allows us to - * move the blocked_tasks[] array directly, element by element. + * Move tasks up to root rcu_node. Don't try to get fancy for + * this corner-case operation -- just put this node's tasks + * at the head of the root node's list, and update the root node's + * ->gp_tasks and ->exp_tasks pointers to those of this node's, + * if non-NULL. This might result in waiting for more tasks than + * absolutely necessary, but this is a good performance/complexity + * tradeoff. */ - if (rcu_preempted_readers(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp)) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; - for (i = 0; i < 4; i++) { - lp = &rnp->blocked_tasks[i]; - lp_root = &rnp_root->blocked_tasks[i]; - while (!list_empty(lp)) { - tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - list_del(&tp->rcu_node_entry); - tp->rcu_blocked_node = rnp_root; - list_add(&tp->rcu_node_entry, lp_root); - raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ - } + lp = &rnp->blkd_tasks; + lp_root = &rnp_root->blkd_tasks; + while (!list_empty(lp)) { + t = list_entry(lp->next, typeof(*t), rcu_node_entry); + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&t->rcu_node_entry); + t->rcu_blocked_node = rnp_root; + list_add(&t->rcu_node_entry, lp_root); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp_root->gp_tasks = rnp->gp_tasks; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp_root->exp_tasks = rnp->exp_tasks; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp_root->boost_tasks = rnp->boost_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ } + +#ifdef CONFIG_RCU_BOOST + /* In case root is being boosted and leaf is not. */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + if (rnp_root->boost_tasks != NULL && + rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks = rnp_root->gp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; return retval; } /* - * Do CPU-offline processing for preemptable RCU. + * Do CPU-offline processing for preemptible RCU. */ static void rcu_preempt_offline_cpu(int cpu) { @@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Process callbacks for preemptable RCU. + * Process callbacks for preemptible RCU. */ static void rcu_preempt_process_callbacks(void) { @@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void) } /* - * Queue a preemptable-RCU callback for invocation after a grace period. + * Queue a preemptible-RCU callback for invocation after a grace period. */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { @@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); */ static int rcu_preempted_readers_exp(struct rcu_node *rnp) { - return !list_empty(&rnp->blocked_tasks[2]) || - !list_empty(&rnp->blocked_tasks[3]); + return rnp->exp_tasks != NULL; } /* @@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { - int must_wait; + unsigned long flags; + int must_wait = 0; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); - list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); - must_wait = rcu_preempted_readers_exp(rnp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (list_empty(&rnp->blkd_tasks)) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else { + rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + must_wait = 1; + } if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) /* * Wait for an rcu-preempt grace period, but expedite it. The basic idea * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blocked_tasks[] lists, move all entries from the first set of - * ->blocked_tasks[] lists to the second set, and finally wait for this - * second set to drain. + * the ->blkd_tasks lists and wait for this list to drain. */ void synchronize_rcu_expedited(void) { @@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void) if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) goto unlock_mb_ret; /* Others did our work for us. */ - /* force all RCU readers onto blocked_tasks[]. */ + /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - /* Snapshot current state of ->blocked_tasks[] lists. */ + /* Snapshot current state of ->blkd_tasks lists. */ rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init(rsp, rnp); if (NUM_RCU_NODES > 1) @@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void) raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, sync_rcu_preempt_exp_done(rnp)); @@ -739,7 +797,7 @@ mb_ret: EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); /* - * Check to see if there is any immediate preemptable-RCU-related work + * Check to see if there is any immediate preemptible-RCU-related work * to be done. */ static int rcu_preempt_pending(int cpu) @@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptable RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU need the CPU to stay out of dynticks mode? */ static int rcu_preempt_needs_cpu(int cpu) { @@ -766,7 +824,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Initialize preemptable RCU's per-CPU data. + * Initialize preemptible RCU's per-CPU data. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) { @@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU. */ static void rcu_preempt_send_cbs_to_online(void) { @@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void) } /* - * Initialize preemptable RCU's state structures. + * Initialize preemptible RCU's state structures. */ static void __init __rcu_init_preempt(void) { @@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void) } /* - * Check for a task exiting while in a preemptable-RCU read-side + * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, * as debug_check_no_locks_held() already does this if lockdep * is enabled. @@ -802,11 +860,13 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +static struct rcu_state *rcu_state = &rcu_sched_state; + /* * Tell them what RCU they are running. */ @@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ static void rcu_preempt_note_context_switch(int cpu) @@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu) } /* - * Because preemptable RCU does not exist, there are never any preempted + * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ -static int rcu_preempted_readers(struct rcu_node *rnp) +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { return 0; } @@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_detail_task_stall(struct rcu_state *rsp) @@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) } /* - * Because preemptable RCU does not exist, we never have to check for + * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ static void rcu_print_task_stall(struct rcu_node *rnp) @@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void) { } -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ - /* - * Because there is no preemptable RCU, there can be no readers blocked, + * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ @@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, it never needs to migrate + * Because preemptible RCU does not exist, it never needs to migrate * tasks that were blocked within RCU read-side critical sections, and * such non-existent tasks cannot possibly have been blocking the current * grace period. @@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, } /* - * Because preemptable RCU does not exist, it never needs CPU-offline + * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ static void rcu_preempt_offline_cpu(int cpu) @@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to check. */ static void rcu_preempt_check_callbacks(int cpu) @@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Because preemptable RCU does not exist, it never has any callbacks + * Because preemptible RCU does not exist, it never has any callbacks * to process. */ static void rcu_preempt_process_callbacks(void) @@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void) /* * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptable RCU does not exist, map to rcu-sched. + * But because preemptible RCU does not exist, map to rcu-sched. */ void synchronize_rcu_expedited(void) { @@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #ifdef CONFIG_HOTPLUG_CPU /* - * Because preemptable RCU does not exist, there is never any need to + * Because preemptible RCU does not exist, there is never any need to * report on tasks preempted in RCU read-side critical sections during * expedited RCU grace periods. */ @@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* - * Because preemptable RCU does not exist, it never has any work to do. + * Because preemptible RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) { @@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptable RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never needs any CPU. */ static int rcu_preempt_needs_cpu(int cpu) { @@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu) } /* - * Because preemptable RCU does not exist, rcu_barrier() is just + * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ void rcu_barrier(void) @@ -992,7 +1048,7 @@ void rcu_barrier(void) EXPORT_SYMBOL_GPL(rcu_barrier); /* - * Because preemptable RCU does not exist, there is no per-CPU + * Because preemptible RCU does not exist, there is no per-CPU * data to initialize. */ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) @@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptable RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there are no callbacks to move. */ static void rcu_preempt_send_cbs_to_online(void) { } /* - * Because preemptable RCU does not exist, it need not be initialized. + * Because preemptible RCU does not exist, it need not be initialized. */ static void __init __rcu_init_preempt(void) { @@ -1015,6 +1071,263 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +#ifdef CONFIG_RCU_TRACE + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ + if (list_empty(&rnp->blkd_tasks)) + rnp->n_balk_blkd_tasks++; + else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) + rnp->n_balk_exp_gp_tasks++; + else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) + rnp->n_balk_boost_tasks++; + else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) + rnp->n_balk_notblocked++; + else if (rnp->gp_tasks != NULL && + ULONG_CMP_LT(jiffies, rnp->boost_time)) + rnp->n_balk_notyet++; + else + rnp->n_balk_nos++; +} + +#else /* #ifdef CONFIG_RCU_TRACE */ + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * Carry out RCU priority boosting on the task indicated by ->exp_tasks + * or ->boost_tasks, advancing the pointer to the next task in the + * ->blkd_tasks list. + * + * Note that irqs must be enabled: boosting the task can block. + * Returns 1 if there are more tasks needing to be boosted. + */ +static int rcu_boost(struct rcu_node *rnp) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + return 0; /* Nothing left to boost. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + + /* + * Recheck under the lock: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own. + */ + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rnp->exp_tasks != NULL) { + tb = rnp->exp_tasks; + rnp->n_exp_boosts++; + } else { + tb = rnp->boost_tasks; + rnp->n_normal_boosts++; + } + rnp->n_tasks_boosted++; + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + * + * Note that task t must acquire rnp->lock to remove itself from + * the ->blkd_tasks list, which it will do from exit() if from + * nowhere else. We therefore are guaranteed that task t will + * stay around at least until we drop rnp->lock. Note that + * rnp->lock also resolves races between our priority boosting + * and task t's exiting its outermost RCU read-side critical + * section. + */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; +} + +/* + * Timer handler to initiate waking up of boost kthreads that + * have yielded the CPU due to excessive numbers of tasks to + * boost. We wake up the per-rcu_node kthread, which in turn + * will wake up the booster kthread. + */ +static void rcu_boost_kthread_timer(unsigned long arg) +{ + invoke_rcu_node_kthread((struct rcu_node *)arg); +} + +/* + * Priority-boosting kthread. One per leaf rcu_node and one for the + * root rcu_node. + */ +static int rcu_boost_kthread(void *arg) +{ + struct rcu_node *rnp = (struct rcu_node *)arg; + int spincnt = 0; + int more2boost; + + for (;;) { + rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + more2boost = rcu_boost(rnp); + if (more2boost) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + spincnt = 0; + } + } + /* NOTREACHED */ + return 0; +} + +/* + * Check to see if it is time to start boosting RCU readers that are + * blocking the current grace period, and, if so, tell the per-rcu_node + * kthread to start boosting them. If there is an expedited grace + * period in progress, it is always time to boost. + * + * The caller must hold rnp->lock, which this function releases, + * but irqs remain disabled. The ->boost_kthread_task is immortal, + * so we don't need to worry about it going away. + */ +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + rnp->n_balk_exp_gp_tasks++; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->exp_tasks != NULL || + (rnp->gp_tasks != NULL && + rnp->boost_tasks == NULL && + rnp->qsmask == 0 && + ULONG_CMP_GE(jiffies, rnp->boost_time))) { + if (rnp->exp_tasks == NULL) + rnp->boost_tasks = rnp->gp_tasks; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + t = rnp->boost_kthread_task; + if (t != NULL) + wake_up_process(t); + } else { + rcu_initiate_boost_trace(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Set the affinity of the boost kthread. The CPU-hotplug locks are + * held, so no one should be messing with the existence of the boost + * kthread. + */ +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ + struct task_struct *t; + + t = rnp->boost_kthread_task; + if (t != NULL) + set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +/* + * Create an RCU-boost kthread for the specified node if one does not + * already exist. We only create this kthread for preemptible RCU. + * Returns zero if all is well, a negated errno otherwise. + */ +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + unsigned long flags; + struct sched_param sp; + struct task_struct *t; + + if (&rcu_preempt_state != rsp) + return 0; + if (rnp->boost_kthread_task != NULL) + return 0; + t = kthread_create(rcu_boost_kthread, (void *)rnp, + "rcub%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + set_task_state(t, TASK_INTERRUPTIBLE); + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + return 0; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ +} + +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ +} + +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + return 0; +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifndef CONFIG_SMP void synchronize_sched_expedited(void) @@ -1187,14 +1500,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * raise_softirq() to cause rcu_process_callbacks() to be invoked later. - * The per-cpu rcu_dyntick_drain variable controls the sequencing. + * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked + * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) { int c = 0; int snap; - int snap_nmi; int thatcpu; /* Check for being in the holdoff period. */ @@ -1205,10 +1517,10 @@ int rcu_needs_cpu(int cpu) for_each_online_cpu(thatcpu) { if (thatcpu == cpu) continue; - snap = per_cpu(rcu_dynticks, thatcpu).dynticks; - snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; + snap = atomic_add_return(0, &per_cpu(rcu_dynticks, + thatcpu).dynticks); smp_mb(); /* Order sampling of snap with end of grace period. */ - if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { + if ((snap & 0x1) != 0) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); @@ -1239,7 +1551,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_cpu_kthread(); return c; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c8e97853b970..9678cc3650f5 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,18 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); + +static char convert_kthread_status(unsigned int kthread_status) +{ + if (kthread_status > RCU_KTHREAD_MAX) + return '?'; + return "SRWOY"[kthread_status]; +} + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -57,14 +69,28 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->passed_quiesc, rdp->passed_quiesc_completed, rdp->qs_pending); #ifdef CONFIG_NO_HZ - seq_printf(m, " dt=%d/%d dn=%d df=%lu", - rdp->dynticks->dynticks, + seq_printf(m, " dt=%d/%d/%d df=%lu", + atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi, + rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", + rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), + per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, + rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -115,13 +141,24 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", - rdp->dynticks->dynticks, + atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi, + rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), + rdp->blimit); seq_printf(m, ",%lu,%lu,%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -130,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); #ifdef CONFIG_NO_HZ - seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); + seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU @@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = { .release = single_release, }; +#ifdef CONFIG_RCU_BOOST + +static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) +{ + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " + "j=%04x bt=%04x\n", + rnp->grplo, rnp->grphi, + "T."[list_empty(&rnp->blkd_tasks)], + "N."[!rnp->gp_tasks], + "E."[!rnp->exp_tasks], + "B."[!rnp->boost_tasks], + convert_kthread_status(rnp->boost_kthread_status), + rnp->n_tasks_boosted, rnp->n_exp_boosts, + rnp->n_normal_boosts, + (int)(jiffies & 0xffff), + (int)(rnp->boost_time & 0xffff)); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", + " balk", + rnp->n_balk_blkd_tasks, + rnp->n_balk_exp_gp_tasks, + rnp->n_balk_boost_tasks, + rnp->n_balk_notblocked, + rnp->n_balk_notyet, + rnp->n_balk_nos); +} + +static int show_rcu_node_boost(struct seq_file *m, void *unused) +{ + struct rcu_node *rnp; + + rcu_for_each_leaf_node(&rcu_preempt_state, rnp) + print_one_rcu_node_boost(m, rnp); + return 0; +} + +static int rcu_node_boost_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_node_boost, NULL); +} + +static const struct file_operations rcu_node_boost_fops = { + .owner = THIS_MODULE, + .open = rcu_node_boost_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Create the rcuboost debugfs entry. Standard error return. + */ +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, + &rcu_node_boost_fops); +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return 0; /* There cannot be an error if we didn't create it! */ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { unsigned long gpnum; int level = 0; - int phase; struct rcu_node *rnp; gpnum = rsp->gpnum; @@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - phase = gpnum & 0x1; - seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", + seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", rnp->qsmask, rnp->qsmaskinit, - "T."[list_empty(&rnp->blocked_tasks[phase])], - "E."[list_empty(&rnp->blocked_tasks[phase + 2])], - "T."[list_empty(&rnp->blocked_tasks[!phase])], - "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], + ".G"[rnp->gp_tasks != NULL], + ".E"[rnp->exp_tasks != NULL], + ".T"[!list_empty(&rnp->blkd_tasks)], rnp->grplo, rnp->grphi, rnp->grpnum); } seq_puts(m, "\n"); @@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = { .release = single_release, }; +static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long completed; + unsigned long gpnum; + unsigned long gpage; + unsigned long gpmax; + struct rcu_node *rnp = &rsp->node[0]; + + raw_spin_lock_irqsave(&rnp->lock, flags); + completed = rsp->completed; + gpnum = rsp->gpnum; + if (rsp->completed == rsp->gpnum) + gpage = 0; + else + gpage = jiffies - rsp->gp_start; + gpmax = rsp->gp_max; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", + rsp->name, completed, gpnum, gpage, gpmax); +} + static int show_rcugp(struct seq_file *m, void *unused) { #ifdef CONFIG_TREE_PREEMPT_RCU - seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", - rcu_preempt_state.completed, rcu_preempt_state.gpnum); + show_one_rcugp(m, &rcu_preempt_state); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", - rcu_sched_state.completed, rcu_sched_state.gpnum); - seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", - rcu_bh_state.completed, rcu_bh_state.gpnum); + show_one_rcugp(m, &rcu_sched_state); + show_one_rcugp(m, &rcu_bh_state); return 0; } @@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = { .release = single_release, }; +static int show_rcutorture(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcutorture test sequence: %lu %s\n", + rcutorture_testseq >> 1, + (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); + seq_printf(m, "rcutorture update version number: %lu\n", + rcutorture_vernum); + return 0; +} + +static int rcutorture_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcutorture, NULL); +} + +static const struct file_operations rcutorture_fops = { + .owner = THIS_MODULE, + .open = rcutorture_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *rcudir; static int __init rcutree_trace_init(void) @@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void) if (!retval) goto free_out; + if (rcu_boost_trace_create_file(rcudir)) + goto free_out; + retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); if (!retval) goto free_out; @@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void) NULL, &rcu_pending_fops); if (!retval) goto free_out; + + retval = debugfs_create_file("rcutorture", 0444, rcudir, + NULL, &rcutorture_fops); + if (!retval) + goto free_out; return 0; free_out: debugfs_remove_recursive(rcudir); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37a768b..34683efa2cce 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else u64 res_counter_read_u64(struct res_counter *counter, int member) { return *res_counter_member(counter, member); } +#endif int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89bc5ef1..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -9,7 +9,6 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/sysdev.h> #include <linux/timer.h> @@ -27,7 +26,6 @@ struct test_thread_data { int opcode; int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; int event; struct sys_device sysdev; }; @@ -46,9 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ }; @@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[i] = 0; } } - - if (!lockwakeup && td->bkl == 4) { -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - } return 0; case RTTEST_RESETEVENT: @@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[id] = 0; return 0; - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; -#ifdef CONFIG_LOCK_KERNEL - lock_kernel(); -#endif - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - return 0; - default: break; } @@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); break; - case RTTEST_LOCKBKL: default: break; } @@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); return; - case RTTEST_LOCKBKL: - return; default: return; } @@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_lock(&rttest_lock); curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", td->opcode, td->event, tsk->state, (MAX_RT_PRIO - 1) - tsk->prio, (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); + tsk->pi_blocked_on); for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) curr += sprintf(curr, "%d", td->mutexes[i]); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -20,41 +20,34 @@ /* * lock->owner state tracking: * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** * * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. */ static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | mask; + unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter || !waiter->task) + if (!waiter) goto out_unlock_pi; /* * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: + * the previous owner of the lock might have released the lock. */ - if (orig_waiter && !orig_waiter->task) + if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; /* @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } put_task_struct(task); /* Grab the next task */ @@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == task) - return 1; - - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * task->pi_waiters list. This covers the case, - * where task is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as task->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be task: - */ - if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); - plist_add(&next->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - return 1; -} - -/* * Try to take an rt-mutex * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock)) return 0; + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + /* We got the lock. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, current, 0); + rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, current); + rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!owner) + return 0; + if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. + * Remove the top waiter from the current tasks waiter list and wake it up. * * Called with lock->wait_lock held. */ static void wakeup_next_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); /* * Remove it from current->pi_waiters. We do not adjust a @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * lock->wait_lock. */ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + rt_mutex_set_owner(lock, NULL); raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. - */ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); + wake_up_process(waiter->task); } /* - * Remove a waiter from a lock + * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (first && owner != current) { + if (!owner) + return; + + if (first) { raw_spin_lock_irqsave(&owner->pi_lock, flags); @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: passed to task_blocks_on_rt_mutex * * lock->wait_lock must be held by the caller. */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret = 0; for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) + if (try_to_take_rt_mutex(lock, current, waiter)) break; /* @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - /* - * waiter->task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter->task) { - ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter->task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - if (waiter->task) - schedule_rt_mutex(lock); + schedule_rt_mutex(lock); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { + if (try_to_take_rt_mutex(lock, current, NULL)) { raw_spin_unlock(&lock->wait_lock); return 0; } @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter.task)) + if (unlikely(ret)) remove_waiter(lock, &waiter); /* @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - debug_rt_mutex_free_waiter(&waiter); return ret; @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock); + ret = try_to_take_rt_mutex(lock, current, NULL); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, { __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); } @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_set_owner(lock, NULL); rt_mutex_deadlock_account_unlock(proxy_owner); } @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock(&lock->wait_lock); - mark_rt_mutex_waiters(lock); - - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { - /* We got the lock for task. */ - debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); + if (try_to_take_rt_mutex(lock, task, NULL)) { raw_spin_unlock(&lock->wait_lock); - rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { + if (ret && !rt_mutex_owner(lock)) { /* * Reset the return value. We might have * returned with -EDEADLK and the owner @@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } + + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter->task)) + if (unlikely(ret)) remove_waiter(lock, waiter); /* @@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - /* - * Readjust priority, when we did not get the lock. We might have been - * the pending owner and boosted. Since we did not take the lock, the - * PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - return ret; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866af..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) /* * lock->owner state tracking: */ -#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL -#define RT_MUTEX_OWNER_MASKALL 3UL +#define RT_MUTEX_HAS_WAITERS 1UL +#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { @@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); } -static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) -{ - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) -{ - return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; -} - /* * PI-futex support (proxy locking functions, etc.): */ diff --git a/kernel/sched.c b/kernel/sched.c index 42eab5a8437d..cbb3a0eee58e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -32,7 +32,6 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <linux/smp_lock.h> #include <asm/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> @@ -232,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) #endif /* - * sched_domains_mutex serializes calls to arch_init_sched_domains, + * sched_domains_mutex serializes calls to init_sched_domains, * detach_destroy_domains and partition_sched_domains. */ static DEFINE_MUTEX(sched_domains_mutex); @@ -294,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock); * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) +#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -313,6 +312,9 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif struct rb_root tasks_timeline; struct rb_node *rb_leftmost; @@ -324,9 +326,11 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr, *next, *last; + struct sched_entity *curr, *next, *last, *skip; +#ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ @@ -418,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -461,7 +466,7 @@ struct rq { u64 nohz_stamp; unsigned char nohz_balance_kick; #endif - unsigned int skip_clock_update; + int skip_clock_update; /* capture load from *all* tasks on this cpu: */ struct load_weight load; @@ -554,6 +559,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -572,7 +581,7 @@ static inline int cpu_of(struct rq *rq) #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ - rcu_read_lock_sched_held() || \ + rcu_read_lock_held() || \ lockdep_is_held(&sched_domains_mutex)) /* @@ -597,7 +606,7 @@ static inline int cpu_of(struct rq *rq) * Return the group to which this tasks belongs. * * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() * holds that lock for each task it moves into the cgroup. Therefore * by holding that lock, we pin the task to the current cgroup. */ @@ -606,11 +615,8 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; - if (p->flags & PF_EXITING) - return &root_task_group; - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&task_rq(p)->lock)); + lockdep_is_held(&p->pi_lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -646,7 +652,7 @@ static void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update) + if (rq->skip_clock_update > 0) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; @@ -664,10 +670,9 @@ static void update_rq_clock(struct rq *rq) #endif /** - * runqueue_is_locked + * runqueue_is_locked - Returns true if the current cpu runqueue is locked * @cpu: the processor in question. * - * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ @@ -843,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -870,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -887,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. */ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -900,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -914,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize ->cpus_allowed - * against ttwu(). - */ -static inline int task_is_waking(struct task_struct *p) -{ - return unlikely(p->state == TASK_WAKING); -} - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. + * __task_rq_lock - lock the rq @p resides on. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { struct rq *rq; + lockdep_assert_held(&p->pi_lock); + for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); @@ -941,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) } /* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. */ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) __acquires(rq->lock) { struct rq *rq; for (;;) { - local_irq_save(*flags); + raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } } @@ -966,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq) raw_spin_unlock(&rq->lock); } -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) + __releases(p->pi_lock) { - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } /* @@ -1198,11 +1210,17 @@ int get_nohz_timer_target(void) int i; struct sched_domain *sd; + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) - if (!idle_cpu(i)) - return i; + for_each_cpu(i, sched_domain_span(sd)) { + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } } +unlock: + rcu_read_unlock(); return cpu; } /* @@ -1312,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; + if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; else - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) - / (lw->weight+1); + lw->inv_weight = WMULT_CONST / w; } - tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ @@ -1686,6 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + #endif static void calc_load_account_idle(struct rq *this_rq); @@ -1727,17 +1790,20 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + /* * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; + load->weight = scale_load(WEIGHT_IDLEPRIO); + load->inv_weight = WMULT_IDLEPRIO; return; } - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; + load->weight = scale_load(prio_to_weight[prio]); + load->inv_weight = prio_to_wmult[prio]; } static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1745,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1753,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -1880,7 +1944,7 @@ void account_system_vtime(struct task_struct *curr) */ if (hardirq_count()) __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); @@ -1920,8 +1984,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) sched_rt_avg_update(rq, irq_delta); } +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#define sched_clock_irqtime (0) + static void update_rq_clock_task(struct rq *rq, s64 delta) { rq->clock_task += delta; @@ -2025,14 +2121,14 @@ inline int task_curr(const struct task_struct *p) static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, - int oldprio, int running) + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) @@ -2056,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2102,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif #endif trace_sched_migrate_task(p, new_cpu); @@ -2122,19 +2223,6 @@ struct migration_arg { static int migration_cpu_stop(void *data); /* - * The task's runqueue lock must be held. - * Returns true if you have to wait for migration thread. - */ -static bool migrate_task(struct task_struct *p, struct rq *rq) -{ - /* - * If the task is not on a runqueue (and not running), then - * the next wake-up will properly place the task. - */ - return p->se.on_rq || task_running(rq, p); -} - -/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -2191,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -2224,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } @@ -2246,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) * - * NOTE: this function doesnt have to take the runqueue lock, + * NOTE: this function doesn't have to take the runqueue lock, * because all it wants to ensure is that the remote task enters * the kernel. If the IPI races and the task has been migrated * to another CPU then no harm is done and the purpose has been @@ -2265,30 +2356,9 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif /* CONFIG_SMP */ -/** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if (task_curr(p)) - smp_call_function_single(cpu, func, info, 1); - preempt_enable(); -} - #ifdef CONFIG_SMP /* - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -2321,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2352,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample) } #endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync, bool is_migrate, bool is_local, - unsigned long en_flags) +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { +#ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); + + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; + + schedstat_inc(p, se.statistics.nr_wakeups_remote); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + rcu_read_unlock(); + } +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); schedstat_inc(p, se.statistics.nr_wakeups); - if (is_sync) + + if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (is_migrate) + + if (cpu != task_cpu(p)) schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (is_local) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ activate_task(rq, p, en_flags); + p->on_rq = 1; + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - int wake_flags, bool success) +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, success); + trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2391,9 +2496,118 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif - /* if a worker is waking up, notify workqueue */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; +#endif + + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_rq) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} + +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +static int ttwu_activate_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_cpu) { + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; + +} +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + + raw_spin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + raw_spin_unlock(&rq->lock); } /** @@ -2411,92 +2625,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, * Returns %true if @p was woken up, %false if it was already running * or @state didn't match @p's state. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - unsigned long en_flags = ENQUEUE_WAKEUP; - struct rq *rq; - - this_cpu = get_cpu(); + int cpu, success = 0; smp_wmb(); - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; - if (p->se.on_rq) - goto out_running; - + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - orig_cpu = cpu; -#ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; +#ifdef CONFIG_SMP /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } - p->state = TASK_WAKING; - - if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); - en_flags |= ENQUEUE_WAKING; + while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + /* + * In case the architecture enables interrupts in + * context_switch(), we cannot busy wait, since that + * would lead to deadlocks when an interrupt hits and + * tries to wake up @prev. So bail and do a complete + * remote wakeup. + */ + if (ttwu_activate_remote(p, wake_flags)) + goto stat; +#else + cpu_relax(); +#endif } - - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) - set_task_cpu(p, cpu); - __task_rq_unlock(rq); - - rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - /* - * We migrated the task without holding either rq->lock, however - * since the task is not on the task list itself, nobody else - * will try and migrate the task, hence the rq should match the - * cpu we just moved it to. + * Pairs with the smp_wmb() in finish_lock_switch(). */ - WARN_ON(task_cpu(p) != cpu); - WARN_ON(p->state != TASK_WAKING); + smp_rmb(); -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(p); -out_activate: + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); #endif /* CONFIG_SMP */ - ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, - cpu == this_cpu, en_flags); - success = 1; -out_running: - ttwu_post_activation(p, rq, wake_flags, success); + + ttwu_queue(p, cpu); +stat: + ttwu_stat(p, cpu, wake_flags); out: - task_rq_unlock(rq, &flags); - put_cpu(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } @@ -2505,31 +2691,34 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not already there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. this_rq() stays locked over invocation. + * the current task. */ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; BUG_ON(rq != this_rq()); BUG_ON(p == current); lockdep_assert_held(&rq->lock); + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + if (!(p->state & TASK_NORMAL)) - return; + goto out; - if (!p->se.on_rq) { - if (likely(!task_running(rq, p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - success = true; - } - ttwu_post_activation(p, rq, 0, success); + if (!p->on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + + ttwu_do_wakeup(rq, p, 0); + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -2562,18 +2751,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; + p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2583,8 +2775,9 @@ static void __sched_fork(struct task_struct *p) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p, int clone_flags) +void sched_fork(struct task_struct *p) { + unsigned long flags; int cpu = get_cpu(); __sched_fork(p); @@ -2635,16 +2828,16 @@ void sched_fork(struct task_struct *p, int clone_flags) * * Silence PROVE_RCU. */ - rcu_read_lock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu); - rcu_read_unlock(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -2664,41 +2857,31 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p) { unsigned long flags; struct rq *rq; - int cpu __maybe_unused = get_cpu(); + raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_SMP - rq = task_rq_lock(p, &flags); - p->state = TASK_WAKING; - /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug - * - * We set TASK_WAKING so that select_task_rq() can drop rq->lock - * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); - set_task_cpu(p, cpu); - - p->state = TASK_RUNNING; - task_rq_unlock(rq, &flags); + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); activate_task(rq, p, 0); - trace_sched_wakeup_new(p, 1); + p->on_rq = 1; + trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); #endif - task_rq_unlock(rq, &flags); - put_cpu(); + task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2776,9 +2959,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + trace_sched_switch(prev, next); } /** @@ -2911,7 +3097,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(prev, next); + mm = next->mm; oldmm = prev->active_mm; /* @@ -3404,27 +3590,22 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - struct rq *rq; int dest_cpu; - rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + raw_spin_lock_irqsave(&p->pi_lock, flags); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; - /* - * select_task_rq() can race against ->cpus_allowed - */ - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + if (likely(cpu_active(dest_cpu))) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #endif @@ -3461,7 +3642,7 @@ unsigned long long task_delta_exec(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3479,7 +3660,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3503,7 +3684,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); thread_group_cputime(p, &totals); ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3568,6 +3749,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, } /* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3578,36 +3785,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); return; } - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + target_cputime64 = &cpustat->softirq; else - cpustat->system = cputime64_add(cpustat->system, tmp); + target_cputime64 = &cpustat->system; - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); - - /* Account for system time used */ - acct_update_integrals(p); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait + * @cputime: the cpu time spent in involuntary wait */ void account_steal_time(cputime_t cputime) { @@ -3635,6 +3832,73 @@ void account_idle_time(cputime_t cputime) #ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->softirq); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + /* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to @@ -3645,6 +3909,11 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -3670,6 +3939,12 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + account_idle_time(jiffies_to_cputime(ticks)); } @@ -3763,9 +4038,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. */ void scheduler_tick(void) { @@ -3885,17 +4157,11 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq(), sched_count); -#ifdef CONFIG_SCHEDSTATS - if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), rq_sched_info.bkl_count); - schedstat_inc(prev, sched_info.bkl_count); - } -#endif } static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -3945,9 +4211,6 @@ need_resched: rcu_note_context_switch(cpu); prev = rq->curr; - release_kernel_lock(prev); -need_resched_nonpreemptible: - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3960,11 +4223,13 @@ need_resched_nonpreemptible: if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. */ if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; @@ -3973,7 +4238,16 @@ need_resched_nonpreemptible: if (to_wakeup) try_to_wake_up_local(to_wakeup); } - deactivate_task(rq, prev, DEQUEUE_SLEEP); + + /* + * If we are going to sleep and we have plugged IO + * queued, make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_schedule_flush_plug(prev); + raw_spin_lock(&rq->lock); + } } switch_count = &prev->nvcsw; } @@ -3989,9 +4263,6 @@ need_resched_nonpreemptible: rq->skip_clock_update = 0; if (likely(prev != next)) { - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -4010,9 +4281,6 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(prev))) - goto need_resched_nonpreemptible; - preempt_enable_no_resched(); if (need_resched()) goto need_resched; @@ -4020,70 +4288,53 @@ need_resched_nonpreemptible: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. - */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? - */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif @@ -4543,19 +4794,18 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; int oldprio, on_rq, running; struct rq *rq; const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4571,12 +4821,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } - task_rq_unlock(rq, &flags); + check_class_changed(rq, p, prev_class, oldprio); + __task_rq_unlock(rq); } #endif @@ -4604,7 +4853,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4624,7 +4873,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_task(rq->curr); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -4738,8 +4987,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -4762,8 +5009,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -4823,12 +5073,15 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } /* can't change other user's priorities */ if (!check_same_owner(p)) @@ -4848,21 +5101,29 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* - * To be able to change p->policy safely, the apropriate + * + * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { + task_rq_unlock(rq, p, &flags); + return -EINVAL; + } + + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return -EINVAL; + return 0; } #ifdef CONFIG_RT_GROUP_SCHED @@ -4874,8 +5135,7 @@ recheck: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EPERM; } } @@ -4884,11 +5144,10 @@ recheck: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -4903,13 +5162,11 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) activate_task(rq, p, 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + check_class_changed(rq, p, prev_class, oldprio); + task_rq_unlock(rq, p, &flags); rt_mutex_adjust_pi(p); @@ -5089,7 +5346,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); @@ -5160,7 +5417,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; unsigned long flags; - struct rq *rq; int retval; get_online_cpus(); @@ -5175,9 +5431,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -5324,6 +5580,67 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. + */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) { + schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. + */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. @@ -5334,6 +5651,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5349,6 +5667,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; @@ -5439,7 +5758,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, rq = task_rq_lock(p, &flags); time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5497,7 +5816,7 @@ void show_state_filter(unsigned long state_filter) do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: + * console might take a lot of time: */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) @@ -5541,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + do_set_cpus_allowed(idle, cpumask_of(cpu)); /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -5557,22 +5876,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* @@ -5632,6 +5948,16 @@ static inline void sched_init_granularity(void) } #ifdef CONFIG_SMP +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + else { + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + } +} + /* * This is how migration works: * @@ -5662,52 +5988,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - /* - * Serialize against TASK_WAKING so that ttwu() and wunt() can - * drop the rq->lock and still rely on ->cpus_allowed. - */ -again: - while (task_is_waking(p)) - cpu_relax(); rq = task_rq_lock(p, &flags); - if (task_is_waking(p)) { - task_rq_unlock(rq, &flags); - goto again; - } + + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpumask_equal(&p->cpus_allowed, new_mask))) { + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { ret = -EINVAL; goto out; } - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - else { - cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); - } + do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (p->on_rq) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ret; } @@ -5735,6 +6047,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); + raw_spin_lock(&p->pi_lock); double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) @@ -5747,7 +6060,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -5757,6 +6070,7 @@ done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); return ret; } @@ -6097,6 +6411,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6112,6 +6427,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif } + + update_max_interval(); + return NOTIFY_OK; } @@ -6172,6 +6490,8 @@ early_initcall(migration_init); #ifdef CONFIG_SMP +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + #ifdef CONFIG_SCHED_DEBUG static __read_mostly int sched_domain_debug_enabled; @@ -6246,7 +6566,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_LOAD_SCALE) { + if (group->cpu_power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", group->cpu_power); } @@ -6267,7 +6587,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, static void sched_domain_debug(struct sched_domain *sd, int cpu) { - cpumask_var_t groupmask; int level = 0; if (!sched_domain_debug_enabled) @@ -6280,20 +6599,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); - return; - } - for (;;) { - if (sched_domain_debug_one(sd, cpu, level, groupmask)) + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) break; level++; sd = sd->parent; if (!sd) break; } - free_cpumask_var(groupmask); } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6350,12 +6663,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -static void free_rootdomain(struct root_domain *rd) +static void free_rootdomain(struct rcu_head *rcu) { - synchronize_sched(); + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); - free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -6396,7 +6708,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - free_rootdomain(old_rd); + call_rcu_sched(&old_rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -6447,6 +6759,25 @@ static struct root_domain *alloc_rootdomain(void) return rd; } +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + if (atomic_dec_and_test(&sd->groups->ref)) + kfree(sd->groups); + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -6457,9 +6788,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - for (tmp = sd; tmp; tmp = tmp->parent) - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); - /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -6470,12 +6798,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; } if (sd && sd_degenerate(sd)) { + tmp = sd; sd = sd->parent; + destroy_sched_domain(tmp, cpu); if (sd) sd->child = NULL; } @@ -6483,7 +6814,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); + tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); } /* cpus with isolated domains */ @@ -6499,56 +6832,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -/* - * init_sched_build_groups takes the cpumask we wish to span, and a pointer - * to a function which identifies what group(along with sched group) a CPU - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids - * (due to the fact that we keep track of groups covered with a struct cpumask). - * - * init_sched_build_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. - */ -static void -init_sched_build_groups(const struct cpumask *span, - const struct cpumask *cpu_map, - int (*group_fn)(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *tmpmask), - struct cpumask *covered, struct cpumask *tmpmask) -{ - struct sched_group *first = NULL, *last = NULL; - int i; - - cpumask_clear(covered); - - for_each_cpu(i, span) { - struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg, tmpmask); - int j; - - if (cpumask_test_cpu(i, covered)) - continue; - - cpumask_clear(sched_group_cpus(sg)); - sg->cpu_power = 0; - - for_each_cpu(j, span) { - if (group_fn(j, cpu_map, NULL, tmpmask) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } - if (!first) - first = sg; - if (last) - last->next = sg; - last = sg; - } - last->next = first; -} - #define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA @@ -6565,7 +6848,7 @@ init_sched_build_groups(const struct cpumask *span, */ static int find_next_best_node(int node, nodemask_t *used_nodes) { - int i, n, val, min_val, best_node = 0; + int i, n, val, min_val, best_node = -1; min_val = INT_MAX; @@ -6589,7 +6872,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) } } - node_set(best_node, *used_nodes); + if (best_node != -1) + node_set(best_node, *used_nodes); return best_node; } @@ -6615,315 +6899,130 @@ static void sched_domain_node_span(int node, struct cpumask *span) for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - + if (next_node < 0) + break; cpumask_or(span, span, cpumask_of_node(next_node)); } } + +static const struct cpumask *cpu_node_mask(int cpu) +{ + lockdep_assert_held(&sched_domains_mutex); + + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + + return sched_domains_tmpmask; +} + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ + return cpu_possible_mask; +} #endif /* CONFIG_NUMA */ -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * - * ( See the the comments in include/linux/sched.h:struct sched_group - * and struct sched_domain. ) - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; }; struct s_data { -#ifdef CONFIG_NUMA - int sd_allnodes; - cpumask_var_t domainspan; - cpumask_var_t covered; - cpumask_var_t notcovered; -#endif - cpumask_var_t nodemask; - cpumask_var_t this_sibling_map; - cpumask_var_t this_core_map; - cpumask_var_t this_book_map; - cpumask_var_t send_covered; - cpumask_var_t tmpmask; - struct sched_group **sched_group_nodes; + struct sched_domain ** __percpu sd; struct root_domain *rd; }; enum s_alloc { - sa_sched_groups = 0, sa_rootdomain, - sa_tmpmask, - sa_send_covered, - sa_this_book_map, - sa_this_core_map, - sa_this_sibling_map, - sa_nodemask, - sa_sched_group_nodes, -#ifdef CONFIG_NUMA - sa_notcovered, - sa_covered, - sa_domainspan, -#endif + sa_sd, + sa_sd_storage, sa_none, }; -/* - * SMT sched-domains: - */ -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_groups); +struct sched_domain_topology_level; -static int -cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_groups, cpu).sg; - return cpu; -} -#endif /* CONFIG_SCHED_SMT */ +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); -/* - * multi-core sched-domains: - */ -#ifdef CONFIG_SCHED_MC -static DEFINE_PER_CPU(struct static_sched_domain, core_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); - -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_SMT - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif - if (sg) - *sg = &per_cpu(sched_group_core, group).sg; - return group; -} -#endif /* CONFIG_SCHED_MC */ +struct sched_domain_topology_level { + sched_domain_init_f init; + sched_domain_mask_f mask; + struct sd_data data; +}; /* - * book sched-domains: + * Assumes the sched_domain tree is fully constructed */ -#ifdef CONFIG_SCHED_BOOK -static DEFINE_PER_CPU(struct static_sched_domain, book_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); - -static int -cpu_to_book_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { - int group = cpu; -#ifdef CONFIG_SCHED_MC - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#endif - if (sg) - *sg = &per_cpu(sched_group_book, group).sg; - return group; -} -#endif /* CONFIG_SCHED_BOOK */ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; -static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); + if (child) + cpu = cpumask_first(sched_domain_span(child)); -static int -cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *mask) -{ - int group; -#ifdef CONFIG_SCHED_BOOK - cpumask_and(mask, cpu_book_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_MC) - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); - group = cpumask_first(mask); -#else - group = cpu; -#endif if (sg) - *sg = &per_cpu(sched_group_phys, group).sg; - return group; + *sg = *per_cpu_ptr(sdd->sg, cpu); + + return cpu; } -#ifdef CONFIG_NUMA /* - * The init_sched_build_groups can't handle what we want to do with node - * groups, so roll our own. Now each node has its own list of groups which - * gets dynamically allocated. + * build_sched_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids + * (due to the fact that we keep track of groups covered with a struct cpumask). + * + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. */ -static DEFINE_PER_CPU(struct static_sched_domain, node_domains); -static struct sched_group ***sched_group_nodes_bycpu; - -static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); - -static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, - struct cpumask *nodemask) -{ - int group; - - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); - group = cpumask_first(nodemask); - - if (sg) - *sg = &per_cpu(sched_group_allnodes, group).sg; - return group; -} - -static void init_numa_sched_groups_power(struct sched_group *group_head) -{ - struct sched_group *sg = group_head; - int j; - - if (!sg) - return; - do { - for_each_cpu(j, sched_group_cpus(sg)) { - struct sched_domain *sd; - - sd = &per_cpu(phys_domains, j).sd; - if (j != group_first_cpu(sd->groups)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - } while (sg != group_head); -} - -static int build_numa_sched_groups(struct s_data *d, - const struct cpumask *cpu_map, int num) +static void +build_sched_groups(struct sched_domain *sd) { - struct sched_domain *sd; - struct sched_group *sg, *prev; - int n, j; - - cpumask_clear(d->covered); - cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); - if (cpumask_empty(d->nodemask)) { - d->sched_group_nodes[num] = NULL; - goto out; - } - - sched_domain_node_span(num, d->domainspan); - cpumask_and(d->domainspan, d->domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); - return -ENOMEM; - } - d->sched_group_nodes[num] = sg; - - for_each_cpu(j, d->nodemask) { - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->nodemask); - sg->next = sg; - cpumask_or(d->covered, d->covered, d->nodemask); + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; - prev = sg; - for (j = 0; j < nr_node_ids; j++) { - n = (num + j) % nr_node_ids; - cpumask_complement(d->notcovered, d->covered); - cpumask_and(d->tmpmask, d->notcovered, cpu_map); - cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); - if (cpumask_empty(d->tmpmask)) - break; - cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d->tmpmask)) - continue; - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, num); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - return -ENOMEM; - } - sg->cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d->tmpmask); - sg->next = prev->next; - cpumask_or(d->covered, d->covered, d->tmpmask); - prev->next = sg; - prev = sg; - } -out: - return 0; -} -#endif /* CONFIG_NUMA */ + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; -#ifdef CONFIG_NUMA -/* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ - int cpu, i; + cpumask_clear(covered); - for_each_cpu(cpu, cpu_map) { - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; - if (!sched_group_nodes) + if (cpumask_test_cpu(i, covered)) continue; - for (i = 0; i < nr_node_ids; i++) { - struct sched_group *oldsg, *sg = sched_group_nodes[i]; + cpumask_clear(sched_group_cpus(sg)); + sg->cpu_power = 0; - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) continue; - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; } + last->next = first; } -#else /* !CONFIG_NUMA */ -static void free_sched_groups(const struct cpumask *cpu_map, - struct cpumask *nodemask) -{ -} -#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -6937,11 +7036,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { - struct sched_domain *child; - struct sched_group *group; - long power; - int weight; - WARN_ON(!sd || !sd->groups); if (cpu != group_first_cpu(sd->groups)) @@ -6949,36 +7043,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); - child = sd->child; - - sd->groups->cpu_power = 0; - - if (!child) { - power = SCHED_LOAD_SCALE; - weight = cpumask_weight(sched_domain_span(sd)); - /* - * SMT siblings share the power of a single core. - * Usually multiple threads get a better yield out of - * that one core than a single thread would have, - * reflect that in sd->smt_gain. - */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= sd->smt_gain; - power /= weight; - power >>= SCHED_LOAD_SHIFT; - } - sd->groups->cpu_power += power; - return; - } - - /* - * Add cpu_power of each child group to this groups cpu_power. - */ - group = child->groups; - do { - sd->groups->cpu_power += group->cpu_power; - group = group->next; - } while (group != child->groups); + update_group_power(sd, cpu); } /* @@ -6992,15 +7057,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) # define SD_INIT_NAME(sd, type) do { } while (0) #endif -#define SD_INIT(sd, type) sd_init_##type(sd) - -#define SD_INIT_FUNC(type) \ -static noinline void sd_init_##type(struct sched_domain *sd) \ -{ \ - memset(sd, 0, sizeof(*sd)); \ - *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ - SD_INIT_NAME(sd, type); \ +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ } SD_INIT_FUNC(CPU) @@ -7019,13 +7084,14 @@ SD_INIT_FUNC(CPU) #endif static int default_relax_domain_level = -1; +int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { unsigned long val; val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) + if (val < sched_domain_level_max) default_relax_domain_level = val; return 1; @@ -7053,37 +7119,20 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { switch (what) { - case sa_sched_groups: - free_sched_groups(cpu_map, d->tmpmask); /* fall through */ - d->sched_group_nodes = NULL; case sa_rootdomain: - free_rootdomain(d->rd); /* fall through */ - case sa_tmpmask: - free_cpumask_var(d->tmpmask); /* fall through */ - case sa_send_covered: - free_cpumask_var(d->send_covered); /* fall through */ - case sa_this_book_map: - free_cpumask_var(d->this_book_map); /* fall through */ - case sa_this_core_map: - free_cpumask_var(d->this_core_map); /* fall through */ - case sa_this_sibling_map: - free_cpumask_var(d->this_sibling_map); /* fall through */ - case sa_nodemask: - free_cpumask_var(d->nodemask); /* fall through */ - case sa_sched_group_nodes: -#ifdef CONFIG_NUMA - kfree(d->sched_group_nodes); /* fall through */ - case sa_notcovered: - free_cpumask_var(d->notcovered); /* fall through */ - case sa_covered: - free_cpumask_var(d->covered); /* fall through */ - case sa_domainspan: - free_cpumask_var(d->domainspan); /* fall through */ -#endif + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ case sa_none: break; } @@ -7092,308 +7141,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) { -#ifdef CONFIG_NUMA - if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) - return sa_none; - if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) - return sa_domainspan; - if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) - return sa_covered; - /* Allocate the per-node list of sched groups */ - d->sched_group_nodes = kcalloc(nr_node_ids, - sizeof(struct sched_group *), GFP_KERNEL); - if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - return sa_notcovered; - } - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; -#endif - if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) - return sa_sched_group_nodes; - if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) - return sa_nodemask; - if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) - return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) - return sa_this_core_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) - return sa_this_book_map; - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) - return sa_send_covered; + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; d->rd = alloc_rootdomain(); - if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - return sa_tmpmask; - } + if (!d->rd) + return sa_sd; return sa_rootdomain; } -static struct sched_domain *__build_numa_sched_domains(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) { - struct sched_domain *sd = NULL; -#ifdef CONFIG_NUMA - struct sched_domain *parent; - - d->sd_allnodes = 0; - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); - d->sd_allnodes = 1; - } - parent = sd; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = parent; - if (parent) - parent->child = sd; - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); -#endif - return sd; -} + struct sd_data *sdd = sd->private; + struct sched_group *sg = sd->groups; -static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d->nodemask); - sd->parent = parent; - if (parent) - parent->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); - return sd; -} + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; -static struct sched_domain *__build_book_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_BOOK - sd = &per_cpu(book_domains, i).sd; - SD_INIT(sd, BOOK); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); -#endif - return sd; + if (cpu == cpumask_first(sched_group_cpus(sg))) { + WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); + *per_cpu_ptr(sdd->sg, cpu) = NULL; + } } -static struct sched_domain *__build_mc_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) { - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_MC - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); -#endif - return sd; + return topology_thread_cpumask(cpu); } - -static struct sched_domain *__build_smt_sched_domain(struct s_data *d, - const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *parent, int i) -{ - struct sched_domain *sd = parent; -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); - sd->parent = parent; - parent->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); #endif - return sd; -} -static void build_sched_groups(struct s_data *d, enum sched_domain_level l, - const struct cpumask *cpu_map, int cpu) -{ - switch (l) { +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - case SD_LV_SIBLING: /* set up CPU (sibling) groups */ - cpumask_and(d->this_sibling_map, cpu_map, - topology_thread_cpumask(cpu)); - if (cpu == cpumask_first(d->this_sibling_map)) - init_sched_build_groups(d->this_sibling_map, cpu_map, - &cpu_to_cpu_group, - d->send_covered, d->tmpmask); - break; + { sd_init_SIBLING, cpu_smt_mask, }, #endif #ifdef CONFIG_SCHED_MC - case SD_LV_MC: /* set up multi-core groups */ - cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); - if (cpu == cpumask_first(d->this_core_map)) - init_sched_build_groups(d->this_core_map, cpu_map, - &cpu_to_core_group, - d->send_covered, d->tmpmask); - break; + { sd_init_MC, cpu_coregroup_mask, }, #endif #ifdef CONFIG_SCHED_BOOK - case SD_LV_BOOK: /* set up book groups */ - cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); - if (cpu == cpumask_first(d->this_book_map)) - init_sched_build_groups(d->this_book_map, cpu_map, - &cpu_to_book_group, - d->send_covered, d->tmpmask); - break; + { sd_init_BOOK, cpu_book_mask, }, #endif - case SD_LV_CPU: /* set up physical groups */ - cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); - if (!cpumask_empty(d->nodemask)) - init_sched_build_groups(d->nodemask, cpu_map, - &cpu_to_phys_group, - d->send_covered, d->tmpmask); - break; + { sd_init_CPU, cpu_cpu_mask, }, #ifdef CONFIG_NUMA - case SD_LV_ALLNODES: - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, - d->send_covered, d->tmpmask); - break; + { sd_init_NODE, cpu_node_mask, }, + { sd_init_ALLNODES, cpu_allnodes_mask, }, #endif - default: - break; + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + *per_cpu_ptr(sdd->sg, j) = sg; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + kfree(*per_cpu_ptr(sdd->sd, j)); + kfree(*per_cpu_ptr(sdd->sg, j)); + } + free_percpu(sdd->sd); + free_percpu(sdd->sg); } } +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *child, + int cpu) +{ + struct sched_domain *sd = tl->init(tl, cpu); + if (!sd) + return child; + + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; + } + sd->child = child; + + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int __build_sched_domains(const struct cpumask *cpu_map, - struct sched_domain_attr *attr) +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) { enum s_alloc alloc_state = sa_none; - struct s_data d; struct sched_domain *sd; - int i; -#ifdef CONFIG_NUMA - d.sd_allnodes = 0; -#endif + struct s_data d; + int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; - alloc_state = sa_sched_groups; - - /* - * Set up domains for cpus specified by the cpu_map. - */ - for_each_cpu(i, cpu_map) { - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), - cpu_map); - - sd = __build_numa_sched_domains(&d, cpu_map, attr, i); - sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); - } + /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); - build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); - build_sched_groups(&d, SD_LV_MC, cpu_map, i); - } + struct sched_domain_topology_level *tl; - /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) - build_sched_groups(&d, SD_LV_CPU, cpu_map, i); + sd = NULL; + for (tl = sched_domain_topology; tl->init; tl++) + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); -#ifdef CONFIG_NUMA - /* Set up node groups */ - if (d.sd_allnodes) - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); + while (sd->child) + sd = sd->child; - for (i = 0; i < nr_node_ids; i++) - if (build_numa_sched_groups(&d, cpu_map, i)) - goto error; -#endif - - /* Calculate CPU power for physical packages and nodes */ -#ifdef CONFIG_SCHED_SMT - for_each_cpu(i, cpu_map) { - sd = &per_cpu(cpu_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_MC - for_each_cpu(i, cpu_map) { - sd = &per_cpu(core_domains, i).sd; - init_sched_groups_power(i, sd); - } -#endif -#ifdef CONFIG_SCHED_BOOK - for_each_cpu(i, cpu_map) { - sd = &per_cpu(book_domains, i).sd; - init_sched_groups_power(i, sd); + *per_cpu_ptr(d.sd, i) = sd; } -#endif + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { - sd = &per_cpu(phys_domains, i).sd; - init_sched_groups_power(i, sd); - } + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + get_group(i, sd->private, &sd->groups); + atomic_inc(&sd->groups->ref); -#ifdef CONFIG_NUMA - for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(d.sched_group_nodes[i]); + if (i != cpumask_first(sched_domain_span(sd))) + continue; - if (d.sd_allnodes) { - struct sched_group *sg; + build_sched_groups(sd); + } + } + + /* Calculate CPU power for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - d.tmpmask); - init_numa_sched_groups_power(sg); + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + init_sched_groups_power(i, sd); + } } -#endif /* Attach the domains */ + rcu_read_lock(); for_each_cpu(i, cpu_map) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i).sd; -#elif defined(CONFIG_SCHED_MC) - sd = &per_cpu(core_domains, i).sd; -#elif defined(CONFIG_SCHED_BOOK) - sd = &per_cpu(book_domains, i).sd; -#else - sd = &per_cpu(phys_domains, i).sd; -#endif + sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } + rcu_read_unlock(); - d.sched_group_nodes = NULL; /* don't free this we still need it */ - __free_domain_allocs(&d, sa_tmpmask, cpu_map); - return 0; - + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); - return -ENOMEM; -} - -static int build_sched_domains(const struct cpumask *cpu_map) -{ - return __build_sched_domains(cpu_map, NULL); + return ret; } static cpumask_var_t *doms_cur; /* current sched domains */ @@ -7448,7 +7401,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ -static int arch_init_sched_domains(const struct cpumask *cpu_map) +static int init_sched_domains(const struct cpumask *cpu_map) { int err; @@ -7459,32 +7412,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur[0]); + err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); return err; } -static void arch_destroy_sched_domains(const struct cpumask *cpu_map, - struct cpumask *tmpmask) -{ - free_sched_groups(cpu_map, tmpmask); -} - /* * Detach sched domains from a group of cpus specified in cpu_map * These cpus will now be attached to the NULL domain */ static void detach_destroy_domains(const struct cpumask *cpu_map) { - /* Save because hotplug lock held. */ - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); int i; + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); - synchronize_sched(); - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); + rcu_read_unlock(); } /* handle null as "default" */ @@ -7573,8 +7518,7 @@ match1: goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new[i], - dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } @@ -7593,7 +7537,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void arch_reinit_sched_domains(void) +static void reinit_sched_domains(void) { get_online_cpus(); @@ -7626,7 +7570,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - arch_reinit_sched_domains(); + reinit_sched_domains(); return count; } @@ -7745,14 +7689,9 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_active_mask); + init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -7797,6 +7736,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; + /* allow initial update_cfs_load() to truncate */ +#ifdef CONFIG_SMP + cfs_rq->load_stamp = 1; +#endif #endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } @@ -7998,7 +7941,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_LOAD_SCALE; + rq->cpu_power = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -8055,6 +7998,7 @@ void __init sched_init(void) /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); @@ -8075,7 +8019,7 @@ static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); - return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); + return (nested == preempt_offset); } void __might_sleep(const char *file, int line, int preempt_offset) @@ -8110,9 +8054,11 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8120,6 +8066,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); resched_task(rq->curr); } + + check_class_changed(rq, p, prev_class, old_prio); } void normalize_rt_tasks(void) @@ -8235,7 +8183,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8248,8 +8195,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8326,7 +8271,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; struct sched_rt_entity *rt_se; - struct rq *rq; int i; tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8340,8 +8284,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { - rq = cpu_rq(i); - rt_rq = kzalloc_node(sizeof(struct rt_rq), GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) @@ -8456,7 +8398,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); @@ -8475,7 +8417,7 @@ void sched_move_task(struct task_struct *tsk) if (on_rq) enqueue_task(rq, tsk, 0); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, tsk, &flags); } #endif /* CONFIG_CGROUP_SCHED */ @@ -8511,7 +8453,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se), 0); + update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -8846,46 +8788,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) return 0; } -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup) -{ - int retval = cpu_cgroup_can_attach_task(cgrp, tsk); - if (retval) - return retval; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - retval = cpu_cgroup_can_attach_task(cgrp, c); - if (retval) { - rcu_read_unlock(); - return retval; - } - } - rcu_read_unlock(); - } - return 0; -} - static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk, - bool threadgroup) +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { sched_move_task(tsk); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - sched_move_task(c); - } - rcu_read_unlock(); - } } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -8902,14 +8813,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), shareval); + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); } static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); - return (u64) tg->shares; + return (u64) scale_load_down(tg->shares); } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8968,8 +8879,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, + .can_attach_task = cpu_cgroup_can_attach_task, + .attach_task = cpu_cgroup_attach_task, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb656283157..429242f3c484 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; - root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) static inline bool task_group_is_autogroup(struct task_group *tg) { - return tg != &root_task_group && tg->autogroup; + return !!tg->autogroup; } static inline struct task_group * @@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -176,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra refrence added by autogroup_create() */ + /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); @@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { struct autogroup *ag = autogroup_task_get(p); + if (!task_group_is_autogroup(ag->tg)) + goto out; + down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); up_read(&ag->lock); +out: autogroup_kref_put(ag); } #endif /* CONFIG_PROC_FS */ @@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (!enabled || !tg->autogroup) + if (!task_group_is_autogroup(tg)) return 0; return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ffe5dad..05577055cfca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,6 +1,11 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8edd075..a6710a112b4f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; @@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", - rq->rq_sched_info.bkl_count); - #undef P #undef P64 #endif @@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.wait_count); PN(se.statistics.iowait_sum); P(se.statistics.iowait_count); - P(sched_info.bkl_count); P(se.nr_migrations); P(se.statistics.nr_migrations_cold); P(se.statistics.nr_failed_migrations_affine); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450e..433491c2dc8f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/cpumask.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8; unsigned int sysctl_sched_child_runs_first __read_mostly; /* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - -/* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * @@ -365,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } /* @@ -419,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -540,7 +547,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. Skip current tasks that @@ -733,6 +740,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) now - cfs_rq->load_last > 4 * period) { cfs_rq->load_period = 0; cfs_rq->load_avg = 0; + delta = period - 1; } cfs_rq->load_stamp = now; @@ -763,16 +771,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +797,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +805,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +830,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +844,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +853,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +984,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -996,19 +1002,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } +} + +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1040,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->on_rq = 0; update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); - update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -1050,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; + + update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); } /* @@ -1084,7 +1121,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1128,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1142,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1282,12 +1339,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); } +static void set_next_buddy(struct sched_entity *se); + /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and @@ -1297,14 +1356,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_sleep = flags & DEQUEUE_SLEEP; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) + if (cfs_rq->load.weight) { + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. + */ + if (task_sleep && parent_entity(se)) + set_next_buddy(parent_entity(se)); break; + } flags |= DEQUEUE_SLEEP; } @@ -1312,66 +1379,33 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(cfs_rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; - se->vruntime -= cfs_rq->min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1551,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -1616,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); for_each_domain(target, sd) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; @@ -1635,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target) cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) break; } + rcu_read_unlock(); return target; } @@ -1651,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. */ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1667,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; @@ -1686,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ nr_running += cpu_rq(i)->cfs.nr_running; } - capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (tmp->flags & SD_POWERSAVINGS_BALANCE) nr_running /= 2; @@ -1717,9 +1754,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return select_idle_sibling(p, cpu); - else - return select_idle_sibling(p, prev_cpu); + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; } while (sd) { @@ -1760,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ } /* while loop will break here if sd == NULL */ } +unlock: + rcu_read_unlock(); return new_cpu; } @@ -1783,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) * This is especially important for buddies when the leftmost * task is higher priority than the buddy. */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - - return gran; + return calc_delta_fair(gran, se); } /* @@ -1820,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->last = se; - } + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; } static void set_next_buddy(struct sched_entity *se) { - if (likely(task_of(se)->policy != SCHED_IDLE)) { - for_each_sched_entity(se) - cfs_rq_of(se)->next = se; - } + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; +} + +static void set_skip_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; } /* @@ -1843,12 +1888,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; if (unlikely(se == pse)) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); + next_buddy_marked = 1; + } /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1857,16 +1905,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; @@ -1874,8 +1924,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ update_curr(cfs_rq); find_matching_se(&se, &pse); BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. + */ + if (!next_buddy_marked) + set_next_buddy(pse); goto preempt; + } return; @@ -1932,6 +1989,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -2041,23 +2143,22 @@ static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, - int *this_best_prio, struct cfs_rq *busiest_cfs_rq) + struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2080,9 +2181,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, */ if (rem_load_move <= 0) break; - - if (p->prio < *this_best_prio) - *this_best_prio = p->prio; } out: /* @@ -2092,9 +2190,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -2123,7 +2218,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -2145,7 +2240,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *all_pinned) { long rem_load_move = max_load_move; int busiest_cpu = cpu_of(busiest); @@ -2170,7 +2265,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, busiest_cfs_rq); if (!moved_load) @@ -2196,11 +2291,11 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) + int *all_pinned) { return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, idle, all_pinned, - this_best_prio, &busiest->cfs); + &busiest->cfs); } #endif @@ -2217,12 +2312,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { unsigned long total_load_moved = 0, load_moved; - int this_best_prio = this_rq->curr->prio; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, - sd, idle, all_pinned, &this_best_prio); + sd, idle, all_pinned); total_load_moved += load_moved; @@ -2477,7 +2571,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { - return SCHED_LOAD_SCALE; + return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) @@ -2514,10 +2608,10 @@ unsigned long scale_rt_power(int cpu) available = total - rq->rt_avg; } - if (unlikely((s64)total < SCHED_LOAD_SCALE)) - total = SCHED_LOAD_SCALE; + if (unlikely((s64)total < SCHED_POWER_SCALE)) + total = SCHED_POWER_SCALE; - total >>= SCHED_LOAD_SHIFT; + total >>= SCHED_POWER_SHIFT; return div_u64(available, total); } @@ -2525,7 +2619,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_LOAD_SCALE; + unsigned long power = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { @@ -2534,7 +2628,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_smt_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; } sdg->cpu_power_orig = power; @@ -2544,10 +2638,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_freq_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; power *= scale_rt_power(cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; if (!power) power = 1; @@ -2589,9 +2683,9 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_LOAD_SCALE + * Only siblings can have significantly less than SCHED_POWER_SCALE */ - if (sd->level != SD_LV_SIBLING) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; /* @@ -2610,7 +2704,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2618,7 +2711,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2638,9 +2731,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2681,11 +2771,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. * * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2695,10 +2785,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); sgs->group_weight = group->group_weight; @@ -2755,15 +2846,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2781,7 +2870,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -2874,7 +2963,7 @@ static int check_asym_packing(struct sched_domain *sd, return 0; *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, - SCHED_LOAD_SCALE); + SCHED_POWER_SCALE); return 1; } @@ -2903,7 +2992,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, cpu_avg_load_per_task(this_cpu); scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_LOAD_SCALE; + * SCHED_POWER_SCALE; scaled_busy_load_per_task /= sds->busiest->cpu_power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= @@ -2922,10 +3011,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, min(sds->busiest_load_per_task, sds->max_load); pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_LOAD_SCALE; + pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->busiest->cpu_power; if (sds->max_load > tmp) pwr_move += sds->busiest->cpu_power * @@ -2933,15 +3022,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, /* Amount of load we'd add */ if (sds->max_load * sds->busiest->cpu_power < - sds->busiest_load_per_task * SCHED_LOAD_SCALE) + sds->busiest_load_per_task * SCHED_POWER_SCALE) tmp = (sds->max_load * sds->busiest->cpu_power) / sds->this->cpu_power; else - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->this->cpu_power; pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; + pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) @@ -2983,7 +3072,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity = (sds->busiest_nr_running - sds->busiest_group_capacity); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= sds->busiest->cpu_power; } @@ -3003,11 +3092,11 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, (sds->avg_load - sds->this_load) * sds->this->cpu_power) - / SCHED_LOAD_SCALE; + / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have + * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ @@ -3033,7 +3122,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. @@ -3046,7 +3134,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3056,22 +3144,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); - - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. */ if (!(*balance)) goto ret; @@ -3080,41 +3157,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3148,7 +3240,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, for_each_cpu(i, sched_group_cpus(group)) { unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); unsigned long wl; if (!capacity) @@ -3173,7 +3266,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, * the load can be moved away from the cpu that is potentially * running at a lower capacity. */ - wl = (wl * SCHED_LOAD_SCALE) / power; + wl = (wl * SCHED_POWER_SCALE) / power; if (wl > max_load) { max_load = wl; @@ -3193,7 +3286,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3225,10 +3318,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3246,7 +3335,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3255,20 +3344,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. - */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + group = find_busiest_group(sd, this_cpu, &imbalance, idle, cpus, balance); if (*balance == 0) @@ -3297,6 +3376,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, @@ -3330,8 +3410,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), - this_cpu)) { + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3386,10 +3465,6 @@ redo: sd->balance_interval *= 2; } - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - goto out; out_balanced: @@ -3403,11 +3478,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; + ld_moved = 0; out: return ld_moved; } @@ -3433,6 +3504,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) raw_spin_unlock(&this_rq->lock); update_shares(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3454,6 +3526,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + rcu_read_unlock(); raw_spin_lock(&this_rq->lock); @@ -3502,6 +3575,7 @@ static int active_load_balance_cpu_stop(void *data) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) @@ -3517,6 +3591,7 @@ static int active_load_balance_cpu_stop(void *data) else schedstat_inc(sd, alb_failed); } + rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; @@ -3643,6 +3718,7 @@ static int find_new_ilb(int cpu) { struct sched_domain *sd; struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3658,20 +3734,25 @@ static int find_new_ilb(int cpu) if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; + rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { ilb_group = sd->groups; do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.grp_idle_mask); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); + goto unlock; + } ilb_group = ilb_group->next; } while (ilb_group != sd->groups); } +unlock: + rcu_read_unlock(); out_done: - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3786,6 +3867,17 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -3805,6 +3897,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) update_shares(cpu); + rcu_read_lock(); for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -3815,10 +3908,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + interval = clamp(interval, 1UL, max_load_balance_interval); need_serialize = sd->flags & SD_SERIALIZE; @@ -3831,8 +3921,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } @@ -3854,6 +3943,7 @@ out: if (!balance) break; } + rcu_read_unlock(); /* * next_balance will be updated only when there is a need. @@ -4079,33 +4169,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. */ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4171,6 +4290,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, @@ -4191,6 +4311,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..be40f7371ee1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) * Decrement CPU power based on irq activity */ SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -7,7 +7,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) { } -static void switched_to_idle(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_idle(struct rq *rq, struct task_struct *p) { - /* Can this actually happen?? */ - if (running) - resched_task(rq->curr); - else - check_preempt_curr(rq, p, 0); + BUG(); } -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) { - /* This can happen for hot plug CPUS */ - - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else - check_preempt_curr(rq, p, 0); + BUG(); } static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) @@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 01f75a5f17af..88725c939e0b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +typedef struct task_group *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ + (&iter->list != &task_groups) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) + static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { list_add_rcu(&rt_rq->leaf_rt_rq_list, @@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { } @@ -402,12 +415,13 @@ next: static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; + rt_rq_iter_t iter; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) return; - for_each_leaf_rt_rq(rt_rq, rq) { + for_each_rt_rq(rt_rq, iter, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); s64 want; int i; @@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq) static void __enable_runtime(struct rq *rq) { + rt_rq_iter_t iter; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) @@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq) /* * Reset each runqueue's bandwidth settings */ - for_each_leaf_rt_rq(rt_rq, rq) { + for_each_rt_rq(rt_rq, iter, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); raw_spin_lock(&rt_b->rt_runtime_lock); @@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; enqueue = 1; + + /* + * Force a clock update if the CPU was idle, + * lest wakeup -> unthrottle time accumulate. + */ + if (rt_rq->rt_nr_running && rq->curr == rq->idle) + rq->skip_clock_update = -1; } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { + struct task_struct *curr; + struct rq *rq; + int cpu; + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + /* - * If the current task is an RT task, then + * If the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * lock? * * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. */ - if (unlikely(rt_task(rq->curr)) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio) && + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); + int target = find_lowest_rq(p); - return (cpu == -1) ? task_cpu(p) : cpu; + if (target != -1) + cpu = target; } + rcu_read_unlock(); - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); + return cpu; } static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) @@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1227,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task) if (!cpumask_test_cpu(this_cpu, lowest_mask)) this_cpu = -1; /* Skip this_cpu opt if not among lowest */ + rcu_read_lock(); for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_AFFINE) { int best_cpu; @@ -1236,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task) * remote processor. */ if (this_cpu != -1 && - cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); return this_cpu; + } best_cpu = cpumask_first_and(lowest_mask, sched_domain_span(sd)); - if (best_cpu < nr_cpu_ids) + if (best_cpu < nr_cpu_ids) { + rcu_read_unlock(); return best_cpu; + } } } + rcu_read_unlock(); /* * And finally, if there were no matches within the domains @@ -1287,7 +1329,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !task->se.on_rq)) { + !task->on_rq)) { raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; @@ -1321,7 +1363,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->rt.nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); return p; @@ -1378,7 +1420,7 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasnt moved at all, but + * If we get here, the task hasn't moved at all, but * it has failed to push. We will not try again, * since the other cpus will pull from us when they * are ready. @@ -1467,7 +1509,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * There's a chance that p is higher in priority @@ -1488,7 +1530,7 @@ static int pull_rt_task(struct rq *this_rq) /* * We continue with the search, just in * case there's an even higher prio task - * in another runqueue. (low likelyhood + * in another runqueue. (low likelihood * but possible) */ } @@ -1538,7 +1580,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); if (!task_current(rq, p)) { @@ -1599,8 +1641,7 @@ static void rq_offline_rt(struct rq *rq) * When switch from the rt queue, we bring ourselves to a position * that we might want to pull RT tasks from other runqueues. */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* * If there are other RT tasks then we will reschedule @@ -1609,7 +1650,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, * we may need to handle the pulling of RT tasks * now. */ - if (!rq->rt.rt_nr_running) + if (p->on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1628,8 +1669,7 @@ static inline void init_sched_rt_class(void) * with RT tasks. In this case we try to push them off to * other runqueues. */ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_rt(struct rq *rq, struct task_struct *p) { int check_resched = 1; @@ -1640,7 +1680,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (!running) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1656,10 +1696,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * Priority of the task has changed. This may cause * us to initiate a push or pull. */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (running) { + if (!p->on_rq) + return; + + if (rq->curr == p) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -1795,10 +1838,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); static void print_rt_stats(struct seq_file *m, int cpu) { + rt_rq_iter_t iter; struct rt_rq *rt_rq; rcu_read_lock(); - for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) + for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 48ddf431db0e..331e01bcd026 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v) #ifdef CONFIG_SMP /* domain-specific stats */ - preempt_disable(); + rcu_read_lock(); for_each_domain(cpu, sd) { enum cpu_idle_type itype; @@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } - preempt_enable(); + rcu_read_unlock(); #endif } kfree(mask_str); diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -9,8 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct rq *rq, struct task_struct *p, - int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->se.on_rq) + if (stop && stop->on_rq) return stop; return NULL; @@ -59,14 +58,13 @@ static void set_curr_task_stop(struct rq *rq) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } -static void prio_changed_stop(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); /* how!?, what priority? */ } @@ -103,6 +101,4 @@ static const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, - - /* no .task_new for stop tasks */ }; diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff10fdce..86c32b884f8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if (t->signal->group_stop_count > 0 || + if ((t->group_stop & GROUP_STOP_PENDING) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -223,10 +223,87 @@ static inline void print_dropped_signal(int sig) current->comm, current->pid, sig); } +/** + * task_clear_group_stop_trapping - clear group stop trapping bit + * @task: target task + * + * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it + * and wake up the ptracer. Note that we don't need any further locking. + * @task->siglock guarantees that @task->parent points to the ptracer. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void task_clear_group_stop_trapping(struct task_struct *task) +{ + if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { + task->group_stop &= ~GROUP_STOP_TRAPPING; + __wake_up_sync_key(&task->parent->signal->wait_chldexit, + TASK_UNINTERRUPTIBLE, 1, task); + } +} + +/** + * task_clear_group_stop_pending - clear pending group stop + * @task: target task + * + * Clear group stop states for @task. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +void task_clear_group_stop_pending(struct task_struct *task) +{ + task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | + GROUP_STOP_DEQUEUED); +} + +/** + * task_participate_group_stop - participate in a group stop + * @task: task participating in a group stop + * + * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * Group stop states are cleared and the group stop count is consumed if + * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * stop, the appropriate %SIGNAL_* flags are set. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if group stop completion should be notified to the parent, %false + * otherwise. + */ +static bool task_participate_group_stop(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + bool consume = task->group_stop & GROUP_STOP_CONSUME; + + WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + + task_clear_group_stop_pending(task); + + if (!consume) + return false; + + if (!WARN_ON_ONCE(sig->group_stop_count == 0)) + sig->group_stop_count--; + + /* + * Tell the caller to notify completion iff we are entering into a + * fresh group stop. Read comment in do_signal_stop() for details. + */ + if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { + sig->flags = SIGNAL_STOP_STOPPED; + return true; + } + return false; +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to stop the target task from exiting + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) @@ -375,15 +452,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tracehook_consider_fatal_signal(tsk, sig); } - -/* Notify the system that a driver wants to block all signals for this +/* + * Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be * sent/acted upon. If the notifier routine returns non-zero, then the * signal will be acted upon after all. If the notifier routine returns 0, * then then signal will be blocked. Only one block per process is * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. */ - + * can use to determine if the signal should be blocked or not. + */ void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) { @@ -434,9 +511,10 @@ still_pending: copy_siginfo(info, &first->info); __sigqueue_free(first); } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. So zero out the info. */ info->si_signo = sig; info->si_errno = 0; @@ -468,7 +546,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is + * Dequeue a signal and return the element to the caller, which is * expected to free it. * * All callers have to hold the siglock. @@ -490,7 +568,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * itimers are process shared and we restart periodic * itimers in the signal delivery path to prevent DoS * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting + * compliant with the old way of self-restarting * itimers, as the SIGALRM is a legacy signal and only * queued once. Changing the restart behaviour to * restart the timer in the signal dequeue path is @@ -526,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + current->group_stop |= GROUP_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -591,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) if (sigisemptyset(&m)) return 0; - signandsets(&s->signal, &s->signal, mask); + sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); @@ -636,13 +714,33 @@ static inline bool si_fromuser(const struct siginfo *info) } /* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + +/* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,14 +754,8 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - cred = current_cred(); - tcred = __task_cred(t); if (!same_thread_group(current, t) && - (cred->euid ^ tcred->suid) && - (cred->euid ^ tcred->uid) && - (cred->uid ^ tcred->suid) && - (cred->uid ^ tcred->uid) && - !capable(CAP_KILL)) { + !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); @@ -712,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } else if (sig == SIGCONT) { unsigned int why; /* - * Remove all stop signals from all queues, - * and wake all threads. + * Remove all stop signals from all queues, wake all threads. */ rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - unsigned int state; + task_clear_group_stop_pending(t); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - /* - * If there is a handler for SIGCONT, we must make - * sure that no thread returns to user mode before - * we post the signal, in case it was the only - * thread eligible to run the signal handler--then - * it must not do anything between resuming and - * running the handler. With the TIF_SIGPENDING - * flag set, the thread will pause and acquire the - * siglock that we hold now and until we've queued - * the pending signal. - * - * Wake up the stopped thread _after_ setting - * TIF_SIGPENDING - */ - state = __TASK_STOPPED; - if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { - set_tsk_thread_flag(t, TIF_SIGPENDING); - state |= TASK_INTERRUPTIBLE; - } - wake_up_state(t, state); + wake_up_state(t, __TASK_STOPPED); } while_each_thread(p, t); /* @@ -765,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) signal->flags = why | SIGNAL_STOP_CONTINUED; signal->group_stop_count = 0; signal->group_exit_code = 0; - } else { - /* - * We are not stopped, but there could be a stop - * signal in the middle of being processed after - * being removed from the queue. Clear that too. - */ - signal->flags &= ~SIGNAL_STOP_DEQUEUED; } } @@ -860,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { + task_clear_group_stop_pending(t); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -909,14 +975,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, if (info == SEND_SIG_FORCED) goto out_set; - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - + /* + * Real-time signals must be queued if sent by sigqueue, or + * some other real-time mechanism. It is implementation + * defined whether kill() does so. We attempt to do so, on + * the principle of least surprise, but since kill is not + * allowed to fail with EAGAIN when low on memory we just + * make sure at least one signal gets delivered and don't + * pass on the info struct. + */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else @@ -1093,6 +1160,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { + task_clear_group_stop_pending(t); count++; /* Don't bother with already dead threads */ @@ -1187,8 +1255,7 @@ retry: return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1285,8 +1352,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1354,7 +1420,7 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of POSIX Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. @@ -1522,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig) return ret; } -static void do_notify_parent_cldstop(struct task_struct *tsk, int why) +/** + * do_notify_parent_cldstop - notify parent of stopped/continued state change + * @tsk: task reporting the state change + * @for_ptracer: the notification is for ptracer + * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report + * + * Notify @tsk's parent that the stopped/continued state has changed. If + * @for_ptracer is %false, @tsk's group leader notifies to its real parent. + * If %true, @tsk reports to @tsk->parent which should be the ptracer. + * + * CONTEXT: + * Must be called with tasklist_lock at least read locked. + */ +static void do_notify_parent_cldstop(struct task_struct *tsk, + bool for_ptracer, int why) { struct siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - if (task_ptrace(tsk)) + if (for_ptracer) { parent = tsk->parent; - else { + } else { tsk = tsk->group_leader; parent = tsk->real_parent; } @@ -1539,7 +1619,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; /* - * see comment in do_notify_parent() abot the following 3 lines + * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); @@ -1597,7 +1677,7 @@ static inline int may_ptrace_stop(void) } /* - * Return nonzero if there is a SIGKILL that should be waking us up. + * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ static int sigkill_pending(struct task_struct *tsk) @@ -1607,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk) } /* + * Test whether the target task of the usual cldstop notification - the + * real_parent of @child - is in the same group as the ptracer. + */ +static bool real_parent_is_ptracer(struct task_struct *child) +{ + return same_thread_group(child->parent, child->real_parent); +} + +/* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. @@ -1617,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk) * If we actually decide not to stop at all because the tracer * is gone, we keep current->exit_code unless clear_code. */ -static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { + bool gstop_done = false; + if (arch_ptrace_stop_needed(exit_code, info)) { /* * The arch code has something special to do before a @@ -1641,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) } /* - * If there is a group stop in progress, - * we must participate in the bookkeeping. + * If @why is CLD_STOPPED, we're trapping to participate in a group + * stop. Do the bookkeeping. Note that if SIGCONT was delievered + * while siglock was released for the arch hook, PENDING could be + * clear now. We act as if SIGCONT is received after TASK_TRACED + * is entered - ignore it. */ - if (current->signal->group_stop_count > 0) - --current->signal->group_stop_count; + if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) + gstop_done = task_participate_group_stop(current); current->last_siginfo = info; current->exit_code = exit_code; - /* Let the debugger run. */ - __set_current_state(TASK_TRACED); + /* + * TRACED should be visible before TRAPPING is cleared; otherwise, + * the tracer might fail do_wait(). + */ + set_current_state(TASK_TRACED); + + /* + * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and + * transition to TASK_TRACED should be atomic with respect to + * siglock. This hsould be done after the arch hook as siglock is + * released and regrabbed across it. + */ + task_clear_group_stop_trapping(current); + spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); if (may_ptrace_stop()) { - do_notify_parent_cldstop(current, CLD_TRAPPED); + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + do_notify_parent_cldstop(current, true, why); + if (gstop_done && !real_parent_is_ptracer(current)) + do_notify_parent_cldstop(current, false, why); + /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -1670,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) /* * By the time we got the lock, our tracer went away. * Don't drop the lock yet, another tracer may come. + * + * If @gstop_done, the ptracer went away between group stop + * completion and here. During detach, it would have set + * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED + * in do_signal_stop() on return, so notifying the real + * parent of the group stop completion is enough. */ + if (gstop_done) + do_notify_parent_cldstop(current, false, why); + __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; @@ -1714,79 +1842,128 @@ void ptrace_notify(int exit_code) /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, 1, &info); + ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); spin_unlock_irq(¤t->sighand->siglock); } /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. + * Returns non-zero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int notify; - if (!sig->group_stop_count) { + if (!(current->group_stop & GROUP_STOP_PENDING)) { + unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; struct task_struct *t; - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || + /* signr will be recorded in task->group_stop for retries */ + WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); + + if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; /* - * There is no group stop already in progress. - * We must initiate one now. + * There is no group stop already in progress. We must + * initiate one now. + * + * While ptraced, a task may be resumed while group stop is + * still in effect and then receive a stop signal and + * initiate another group stop. This deviates from the + * usual behavior as two consecutive stop signals can't + * cause two group stops when !ptraced. That is why we + * also check !task_is_stopped(t) below. + * + * The condition can be distinguished by testing whether + * SIGNAL_STOP_STOPPED is already set. Don't generate + * group_exit_code in such case. + * + * This is not necessary for SIGNAL_STOP_CONTINUED because + * an intervening stop signal is required to cause two + * continued events regardless of ptrace. */ - sig->group_exit_code = signr; + if (!(sig->flags & SIGNAL_STOP_STOPPED)) + sig->group_exit_code = signr; + else + WARN_ON_ONCE(!task_ptrace(current)); + current->group_stop &= ~GROUP_STOP_SIGMASK; + current->group_stop |= signr | gstop; sig->group_stop_count = 1; - for (t = next_thread(current); t != current; t = next_thread(t)) + for (t = next_thread(current); t != current; + t = next_thread(t)) { + t->group_stop &= ~GROUP_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ - if (!(t->flags & PF_EXITING) && - !task_is_stopped_or_traced(t)) { + if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { + t->group_stop |= signr | gstop; sig->group_stop_count++; signal_wake_up(t, 0); } + } } - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, report - * to the parent. When ptraced, every thread reports itself. - */ - notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; - notify = tracehook_notify_jctl(notify, CLD_STOPPED); - /* - * tracehook_notify_jctl() can drop and reacquire siglock, so - * we keep ->group_stop_count != 0 before the call. If SIGCONT - * or SIGKILL comes in between ->group_stop_count == 0. - */ - if (sig->group_stop_count) { - if (!--sig->group_stop_count) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; +retry: + if (likely(!task_ptrace(current))) { + int notify = 0; + + /* + * If there are no other threads in the group, or if there + * is a group stop in progress and we are the last to stop, + * report to the parent. + */ + if (task_participate_group_stop(current)) + notify = CLD_STOPPED; + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Notify the parent of the group stop completion. Because + * we're not holding either the siglock or tasklist_lock + * here, ptracer may attach inbetween; however, this is for + * group stop and should always be delivered to the real + * parent of the group leader. The new ptracer will get + * its notification when this task transitions into + * TASK_TRACED. + */ + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, false, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + schedule(); + + spin_lock_irq(¤t->sighand->siglock); + } else { + ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, + CLD_STOPPED, 0, NULL); + current->exit_code = 0; } - spin_unlock_irq(¤t->sighand->siglock); - if (notify) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, notify); - read_unlock(&tasklist_lock); + /* + * GROUP_STOP_PENDING could be set if another group stop has + * started since being woken up or ptrace wants us to transit + * between TASK_STOPPED and TRACED. Retry group stop. + */ + if (current->group_stop & GROUP_STOP_PENDING) { + WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); + goto retry; } - /* Now we don't run again until woken by SIGCONT or SIGKILL */ - do { - schedule(); - } while (try_to_freeze()); + /* PTRACE_ATTACH might have raced with task killing, clear trapping */ + task_clear_group_stop_trapping(current); + + spin_unlock_irq(¤t->sighand->siglock); tracehook_finish_jctl(); - current->exit_code = 0; return 1; } @@ -1800,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info, ptrace_signal_deliver(regs, cookie); /* Let the debugger run. */ - ptrace_stop(signr, 0, info); + ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; @@ -1809,10 +1986,12 @@ static int ptrace_signal(int signr, siginfo_t *info, current->exit_code = 0; - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ + /* + * Update the siginfo structure if the signal has + * changed. If the debugger wanted something + * specific in the siginfo structure then it should + * have updated *info via PTRACE_SETSIGINFO. + */ if (signr != info->si_signo) { info->si_signo = signr; info->si_errno = 0; @@ -1853,25 +2032,43 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - int why = (signal->flags & SIGNAL_STOP_CONTINUED) - ? CLD_CONTINUED : CLD_STOPPED; + struct task_struct *leader; + int why; + + if (signal->flags & SIGNAL_CLD_CONTINUED) + why = CLD_CONTINUED; + else + why = CLD_STOPPED; + signal->flags &= ~SIGNAL_CLD_MASK; - why = tracehook_notify_jctl(why, CLD_CONTINUED); spin_unlock_irq(&sighand->siglock); - if (why) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); - } + /* + * Notify the parent that we're continuing. This event is + * always per-process and doesn't make whole lot of sense + * for ptracers, who shouldn't consume the state via + * wait(2) either, but, for backward compatibility, notify + * the ptracer of the group leader too unless it's gonna be + * a duplicate. + */ + read_lock(&tasklist_lock); + + do_notify_parent_cldstop(current, false, why); + + leader = current->group_leader; + if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) + do_notify_parent_cldstop(leader, true, why); + + read_unlock(&tasklist_lock); + goto relock; } for (;;) { struct k_sigaction *ka; /* - * Tracing can induce an artifical signal and choose sigaction. + * Tracing can induce an artificial signal and choose sigaction. * The return value in @signr determines the default action, * but @info->si_signo is the signal number we will report. */ @@ -1881,8 +2078,8 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) + if (unlikely(current->group_stop & + GROUP_STOP_PENDING) && do_signal_stop(0)) goto relock; signr = dequeue_signal(current, ¤t->blocked, @@ -2001,10 +2198,42 @@ relock: return signr; } +/* + * It could be that complete_signal() picked us to notify about the + * group-wide signal. Other threads should be notified now to take + * the shared signals in @which since we will not. + */ +static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) +{ + sigset_t retarget; + struct task_struct *t; + + sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); + if (sigisemptyset(&retarget)) + return; + + t = tsk; + while_each_thread(tsk, t) { + if (t->flags & PF_EXITING) + continue; + + if (!has_pending_signals(&retarget, &t->blocked)) + continue; + /* Remove the signals this thread can handle. */ + sigandsets(&retarget, &retarget, &t->blocked); + + if (!signal_pending(t)) + signal_wake_up(t, 0); + + if (sigisemptyset(&retarget)) + break; + } +} + void exit_signals(struct task_struct *tsk) { int group_stop = 0; - struct task_struct *t; + sigset_t unblocked; if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; @@ -2020,25 +2249,23 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* It could be that __group_complete_signal() choose us to - * notify about group-wide signal. Another thread should be - * woken now to take the signal since we will not. - */ - for (t = tsk; (t = next_thread(t)) != tsk; ) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); + unblocked = tsk->blocked; + signotset(&unblocked); + retarget_shared_pending(tsk, &unblocked); - if (unlikely(tsk->signal->group_stop_count) && - !--tsk->signal->group_stop_count) { - tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); - } + if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && + task_participate_group_stop(tsk)) + group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); + /* + * If group stop has completed, deliver the notification. This + * should always go to the real parent of the group leader. + */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, group_stop); + do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } @@ -2058,6 +2285,9 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. */ +/** + * sys_restart_syscall - restart a system call + */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = ¤t_thread_info()->restart_block; @@ -2069,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param) return -EINTR; } -/* - * We don't need to get the kernel lock - this is all local to this - * particular thread.. (and that's good, because this is _heavily_ - * used by various programs) +static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) +{ + if (signal_pending(tsk) && !thread_group_empty(tsk)) { + sigset_t newblocked; + /* A set of now blocked but previously unblocked signals. */ + sigandnsets(&newblocked, newset, ¤t->blocked); + retarget_shared_pending(tsk, &newblocked); + } + tsk->blocked = *newset; + recalc_sigpending(); +} + +/** + * set_current_blocked - change current->blocked mask + * @newset: new mask + * + * It is wrong to change ->blocked directly, this helper should be used + * to ensure the process can't miss a shared signal we are going to block. */ +void set_current_blocked(const sigset_t *newset) +{ + struct task_struct *tsk = current; + + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, newset); + spin_unlock_irq(&tsk->sighand->siglock); +} /* * This is also useful for kernel threads that want to temporarily @@ -2085,66 +2337,66 @@ long do_no_restart_syscall(struct restart_block *param) */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { - int error; + struct task_struct *tsk = current; + sigset_t newset; - spin_lock_irq(¤t->sighand->siglock); + /* Lockless, only current can change ->blocked, never from irq */ if (oldset) - *oldset = current->blocked; + *oldset = tsk->blocked; - error = 0; switch (how) { case SIG_BLOCK: - sigorsets(¤t->blocked, ¤t->blocked, set); + sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: - signandsets(¤t->blocked, ¤t->blocked, set); + sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: - current->blocked = *set; + newset = *set; break; default: - error = -EINVAL; + return -EINVAL; } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return error; + set_current_blocked(&newset); + return 0; } -SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, +/** + * sys_rt_sigprocmask - change the list of currently blocked signals + * @how: whether to add, remove, or set signals + * @set: stores pending signals + * @oset: previous value of signal mask if non-null + * @sigsetsize: size of sigset_t type + */ +SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, sigset_t __user *, oset, size_t, sigsetsize) { - int error = -EINVAL; sigset_t old_set, new_set; + int error; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; + old_set = current->blocked; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(sigset_t))) + return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - error = sigprocmask(how, &new_set, &old_set); + error = sigprocmask(how, &new_set, NULL); if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked; - spin_unlock_irq(¤t->sighand->siglock); + return error; + } - set_old: - error = -EFAULT; - if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; + if (oset) { + if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + return -EFAULT; } - error = 0; -out: - return error; + + return 0; } long do_sigpending(void __user *set, unsigned long sigsetsize) @@ -2169,8 +2421,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) out: return error; -} +} +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); @@ -2219,9 +2477,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, - * so check explicitely for the right codes here. + * so check explicitly for the right codes here. */ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); @@ -2250,15 +2508,82 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * do_sigtimedwait - wait for queued signals specified in @which + * @which: queued signals to wait for + * @info: if non-null, the signal's siginfo is returned here + * @ts: upper bound on process time suspension + */ +int do_sigtimedwait(const sigset_t *which, siginfo_t *info, + const struct timespec *ts) +{ + struct task_struct *tsk = current; + long timeout = MAX_SCHEDULE_TIMEOUT; + sigset_t mask = *which; + int sig; + + if (ts) { + if (!timespec_valid(ts)) + return -EINVAL; + timeout = timespec_to_jiffies(ts); + /* + * We can be close to the next tick, add another one + * to ensure we will wait at least the time asked for. + */ + if (ts->tv_sec || ts->tv_nsec) + timeout++; + } + + /* + * Invert the set of allowed signals to get those we want to block. + */ + sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(&mask); + + spin_lock_irq(&tsk->sighand->siglock); + sig = dequeue_signal(tsk, &mask, info); + if (!sig && timeout) { + /* + * None ready, temporarily unblock those we're interested + * while we are sleeping in so that we'll be awakened when + * they arrive. Unblocking is always fine, we can avoid + * set_current_blocked(). + */ + tsk->real_blocked = tsk->blocked; + sigandsets(&tsk->blocked, &tsk->blocked, &mask); + recalc_sigpending(); + spin_unlock_irq(&tsk->sighand->siglock); + + timeout = schedule_timeout_interruptible(timeout); + + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, &tsk->real_blocked); + siginitset(&tsk->real_blocked, 0); + sig = dequeue_signal(tsk, &mask, info); + } + spin_unlock_irq(&tsk->sighand->siglock); + + if (sig) + return sig; + return timeout ? -EINTR : -EAGAIN; +} + +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) { - int ret, sig; sigset_t these; struct timespec ts; siginfo_t info; - long timeout = 0; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) @@ -2266,65 +2591,27 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - - /* - * Invert the set of allowed signals to get those we - * want to block. - */ - sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&these); if (uts) { if (copy_from_user(&ts, uts, sizeof(ts))) return -EFAULT; - if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 - || ts.tv_sec < 0) - return -EINVAL; } - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = (timespec_to_jiffies(&ts) - + (ts.tv_sec || ts.tv_nsec)); - - if (timeout) { - /* None ready -- temporarily unblock those we're - * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &these); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - } - spin_unlock_irq(¤t->sighand->siglock); + ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user(uinfo, &info)) - ret = -EFAULT; - } - } else { - ret = -EAGAIN; - if (timeout) - ret = -EINTR; + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user(uinfo, &info)) + ret = -EFAULT; } return ret; } +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; @@ -2400,7 +2687,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) return do_tkill(tgid, pid, sig); } -/* +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) @@ -2412,6 +2703,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { @@ -2421,9 +2718,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code >= 0 || info.si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -2437,9 +2738,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code >= 0 || info->si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); @@ -2531,12 +2836,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s error = -EINVAL; /* - * - * Note - this code used to test ss_flags incorrectly + * Note - this code used to test ss_flags incorrectly: * old code may have been written using ss_flags==0 * to mean ss_flags==SS_ONSTACK (as this was the only * way that worked) - this fix preserves that older - * mechanism + * mechanism. */ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) goto out; @@ -2570,6 +2874,10 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); @@ -2578,60 +2886,65 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. */ +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @nset: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. + */ -SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, +SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { - int error; old_sigset_t old_set, new_set; + sigset_t new_blocked; - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; + old_set = current->blocked.sig[0]; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(*nset))) + return -EFAULT; new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked.sig[0]; + new_blocked = current->blocked; - error = 0; switch (how) { - default: - error = -EINVAL; - break; case SIG_BLOCK: - sigaddsetmask(¤t->blocked, new_set); + sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: - sigdelsetmask(¤t->blocked, new_set); + sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: - current->blocked.sig[0] = new_set; + new_blocked.sig[0] = new_set; break; + default: + return -EINVAL; } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - old_set = current->blocked.sig[0]; - set_old: - error = -EFAULT; + set_current_blocked(&new_blocked); + } + + if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; + return -EFAULT; } - error = 0; -out: - return error; + + return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: new sigaction + * @oact: used to save the previous sigaction + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, @@ -2710,14 +3023,22 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) SYSCALL_DEFINE0(pause) { - current->state = TASK_INTERRUPTIBLE; - schedule(); + while (!signal_pending(current)) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } return -ERESTARTNOHAND; } #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; diff --git a/kernel/smp.c b/kernel/smp.c index 9910744f0856..73a195193558 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - void (*func) (void *info); + smp_call_func_t func; /* * Since we walk the list without any locks, we might @@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void) if (atomic_read(&data->refs) == 0) continue; - func = data->csd.func; /* for later warn */ - data->csd.func(data->csd.info); + func = data->csd.func; /* save for later warn */ + func(data->csd.info); /* - * If the cpu mask is not still set then it enabled interrupts, - * we took another smp interrupt, and executed the function - * twice on this cpu. In theory that copy decremented refs. + * If the cpu mask is not still set then func enabled + * interrupts (BUG), and this cpu took another smp call + * function interrupt and executed func(info) twice + * on this cpu. That nested execution decremented refs. */ if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pS enabled interrupts and double executed\n", - func); + WARN(1, "%pf enabled interrupts and double executed\n", func); continue; } @@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask, { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu, this_cpu = smp_processor_id(); + int refs, cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress && !early_boot_irqs_disabled); - /* So, what's a CPU they want? Ignoring this one. */ + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); @@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + + /* This BUG_ON verifies our reuse assertions and can be removed */ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); + /* + * The global call function queue list add and delete are protected + * by a lock, but the list is traversed without any lock, relying + * on the rcu list add and delete to allow safe concurrent traversal. + * We reuse the call function data without waiting for any grace + * period after some other cpu removes it from the global queue. + * This means a cpu might find our data block as it is being + * filled out. + * + * We hold off the interrupt handler on the other cpu by + * ordering our writes to the cpu mask vs our setting of the + * refs counter. We assert only the cpu owning the data block + * will set a bit in cpumask, and each bit will only be cleared + * by the subject cpu. Each cpu must first find its bit is + * set and then check that refs is set indicating the element is + * ready to be processed, otherwise it must skip the entry. + * + * On the previous iteration refs was set to 0 by another cpu. + * To avoid the use of transitivity, set the counter to 0 here + * so the wmb will pair with the rmb in the interrupt handler. + */ + atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ + data->csd.func = func; data->csd.info = info; - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); - /* - * To ensure the interrupt handler gets an complete view - * we order the cpumask and refs writes and order the read - * of them in the interrupt handler. In addition we may - * only clear our own cpu bit from the mask. - */ + /* Ensure 0 refs is visible before mask. Also orders func and info */ smp_wmb(); - atomic_set(&data->refs, cpumask_weight(data->cpumask)); + /* We rely on the "and" being processed before the store */ + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, data->cpumask); + refs = cpumask_weight(data->cpumask); + + /* Some callers race with other cpus changing the passed mask */ + if (unlikely(!refs)) { + csd_unlock(&data->csd); + return; + } raw_spin_lock_irqsave(&call_function.lock, flags); /* @@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask, * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); + /* + * We rely on the wmb() in list_add_rcu to complete our writes + * to the cpumask before this write to refs, which indicates + * data is on the list and is ready to be processed. + */ + atomic_set(&data->refs, refs); raw_spin_unlock_irqrestore(&call_function.lock, flags); /* @@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to <NUM>. + */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..13960170cad4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -54,11 +54,11 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER", "RCU" + "TASKLET", "SCHED", "HRTIMER" }; /* @@ -311,9 +311,21 @@ void irq_enter(void) } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else + wakeup_softirqd(); +} #else -# define invoke_softirq() do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else + wakeup_softirqd(); +} #endif /* @@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) /** * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback funtion which gets called from softirq context + * @function: hrtimer callback function which gets called from softirq context * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) */ @@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); - current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { @@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); preempt_enable_no_resched(); cond_resched(); preempt_disable(); @@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b03beb..e3516b29076c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/sys.c b/kernel/sys.c index 18da702ec813..e4128b278f23 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,7 @@ #include <linux/ptrace.h> #include <linux/fs_struct.h> #include <linux/gfp.h> +#include <linux/syscore_ops.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); /* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. + * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + +/* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -296,8 +314,9 @@ void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; + usermodehelper_disable(); device_shutdown(); - sysdev_shutdown(); + syscore_shutdown(); } /** @@ -325,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state) blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); system_state = state; + usermodehelper_disable(); device_shutdown(); } /** @@ -335,7 +355,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); @@ -354,7 +374,7 @@ void kernel_power_off(void) if (pm_power_off_prepare) pm_power_off_prepare(); disable_nonboot_cpus(); - sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); @@ -502,7 +522,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (old->gid == rgid || old->egid == rgid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->gid = rgid; else goto error; @@ -511,7 +531,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (old->gid == egid || old->egid == egid || old->sgid == egid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->egid = egid; else goto error; @@ -546,7 +566,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETGID)) + if (nsown_capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; else if (gid == old->gid || gid == old->sgid) new->egid = new->fsgid = gid; @@ -613,7 +633,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = ruid; if (old->uid != ruid && old->euid != ruid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -622,7 +642,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (old->uid != euid && old->euid != euid && old->suid != euid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -670,7 +690,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETUID)) { + if (nsown_capable(CAP_SETUID)) { new->suid = new->uid = uid; if (uid != old->uid) { retval = set_user(new); @@ -712,7 +732,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETUID)) { + if (!nsown_capable(CAP_SETUID)) { if (ruid != (uid_t) -1 && ruid != old->uid && ruid != old->euid && ruid != old->suid) goto error; @@ -776,7 +796,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETGID)) { + if (!nsown_capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && rgid != old->egid && rgid != old->sgid) goto error; @@ -836,7 +856,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { + nsown_capable(CAP_SETUID)) { if (uid != old_fsuid) { new->fsuid = uid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -869,7 +889,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { + nsown_capable(CAP_SETGID)) { if (gid != old_fsgid) { new->fsgid = gid; goto change_okay; @@ -1177,8 +1197,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); @@ -1226,7 +1247,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1341,6 +1362,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; @@ -1384,19 +1407,22 @@ static int check_prlimit_permission(struct task_struct *task) { const struct cred *cred = current_cred(), *tcred; - tcred = __task_cred(task); - if (current != task && - (cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_RESOURCE)) { - return -EPERM; - } + if (current == task) + return 0; - return 0; + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c7..62cbc8877fef 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt); cond_syscall(compat_sys_getsockopt); cond_syscall(sys_shutdown); cond_syscall(sys_sendmsg); +cond_syscall(sys_sendmmsg); cond_syscall(compat_sys_sendmsg); +cond_syscall(compat_sys_sendmmsg); cond_syscall(sys_recvmsg); cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); @@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); +cond_syscall(compat_sys_semtimedop); cond_syscall(sys_semctl); +cond_syscall(compat_sys_semctl); cond_syscall(sys_msgget); cond_syscall(sys_msgsnd); +cond_syscall(compat_sys_msgsnd); cond_syscall(sys_msgrcv); +cond_syscall(compat_sys_msgrcv); cond_syscall(sys_msgctl); +cond_syscall(compat_sys_msgctl); cond_syscall(sys_shmget); cond_syscall(sys_shmat); +cond_syscall(compat_sys_shmat); cond_syscall(sys_shmdt); cond_syscall(sys_shmctl); +cond_syscall(compat_sys_shmctl); cond_syscall(sys_mq_open); cond_syscall(sys_mq_unlink); cond_syscall(sys_mq_timedsend); @@ -186,3 +195,8 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4eed0af5d144..4fc92445a29c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,6 +56,7 @@ #include <linux/kprobes.h> #include <linux/pipe_fs_i.h> #include <linux/oom.h> +#include <linux/kmod.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -117,6 +118,7 @@ static int neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -169,6 +171,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -361,20 +368,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, - { - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, @@ -617,6 +617,11 @@ static struct ctl_table kern_table[] = { .child = random_table, }, { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, + { .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), @@ -713,7 +718,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -731,14 +736,16 @@ static struct ctl_table kern_table[] = { .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog_enabled, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, }, { .procname = "watchdog_thresh", - .data = &softlockup_thresh, + .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog_thresh, + .proc_handler = proc_dowatchdog, .extra1 = &neg_one, .extra2 = &sixty, }, @@ -756,7 +763,9 @@ static struct ctl_table kern_table[] = { .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog_enabled, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) @@ -948,7 +957,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = perf_proc_update_handler, }, #endif #ifdef CONFIG_KMEMCHECK @@ -978,14 +987,18 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "oom_kill_allocating_task", @@ -1013,7 +1026,8 @@ static struct ctl_table vm_table[] = { .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "dirty_background_ratio", @@ -1061,7 +1075,8 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "nr_pdflush_threads", @@ -1137,6 +1152,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, }, #ifdef CONFIG_COMPACTION { @@ -1489,7 +1506,7 @@ static struct ctl_table fs_table[] = { static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) + defined(CONFIG_S390) || defined(CONFIG_TILE) { .procname = "exception-trace", .data = &show_unhandled_signals, @@ -1690,13 +1707,8 @@ static int test_perm(int mode, int op) int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) { - int error; int mode; - error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); - if (error) - return error; - if (root->permissions) mode = root->permissions(root, current->nsproxy, table); else @@ -2397,6 +2409,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bedf7c9a..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { const struct bin_table *table = NULL; - struct nameidata nd; struct vfsmount *mnt; struct file *file; ssize_t result; char *pathname; int flags; - int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, /* How should the sysctl be accessed? */ if (oldval && oldlen && newval && newlen) { flags = O_RDWR; - acc_mode = MAY_READ | MAY_WRITE; } else if (newval && newlen) { flags = O_WRONLY; - acc_mode = MAY_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; - acc_mode = MAY_READ; } else { result = 0; goto out_putname; } mnt = current->nsproxy->pid_ns->proc_mnt; - result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); - if (result) - goto out_putname; - - result = may_open(&nd.path, acc_mode, flags); - if (result) - goto out_putpath; - - file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; @@ -1370,10 +1357,6 @@ out_putname: putname(pathname); out: return result; - -out_putpath: - path_put(&nd.path); - goto out_putname; } diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8a03c4..4e4932a7b360 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) const char *fail = NULL; if (table->parent) { - if (table->procname && !table->parent->procname) + if (!table->parent->procname) set_fail(&fail, table, "Parent without procname"); } - if (!table->procname) - set_fail(&fail, table, "No procname"); if (table->child) { if (table->data) set_fail(&fail, table, "Directory with data?"); @@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No maxlen"); } #ifdef CONFIG_PROC_SYSCTL - if (table->procname && !table->proc_handler) + if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); -#endif sysctl_check_leaf(namespaces, table, &fail); } if (table->mode > 0777) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b9d58d..9ffea360a778 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -685,7 +685,7 @@ static int __init taskstats_init(void) goto err_cgroup_ops; family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; err_cgroup_ops: genl_unregister_ops(&family, &taskstats_ops); diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..8e8dc6d705c9 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -150,7 +150,7 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) } /** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ -unsigned long nsecs_to_jiffies(u64 n) +u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ @@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) { - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; + return (unsigned long)nsecs_to_jiffies64(n); } -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); /* * Add two timespec values and do a safety check for overflow. diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..e2fd74b8e8c2 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,5 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000000..2d966244ea60 --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,702 @@ +/* + * Alarmtimer interface + * + * This interface provides a timer which is similarto hrtimers, + * but triggers a RTC alarm if the box is suspend. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corperation + * + * Author: John Stultz <john.stultz@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/timerqueue.h> +#include <linux/rtc.h> +#include <linux/alarmtimer.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/posix-timers.h> +#include <linux/workqueue.h> +#include <linux/freezer.h> + +/** + * struct alarm_base - Alarm timer bases + * @lock: Lock for syncrhonized access to the base + * @timerqueue: Timerqueue head managing the list of events + * @timer: hrtimer used to schedule events while running + * @gettime: Function to read the time correlating to the base + * @base_clockid: clockid for the base + */ +static struct alarm_base { + spinlock_t lock; + struct timerqueue_head timerqueue; + struct hrtimer timer; + ktime_t (*gettime)(void); + clockid_t base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +#ifdef CONFIG_RTC_CLASS +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; +static struct rtc_device *rtcdev; +#endif + +/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); + + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to a alarm_base timerqueue and if necessary sets + * an hrtimer to run. + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ + timerqueue_add(&base->timerqueue, &alarm->node); + if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { + hrtimer_try_to_cancel(&base->timer); + hrtimer_start(&base->timer, alarm->node.expires, + HRTIMER_MODE_ABS); + } +} + +/** + * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm to a alarm_base timerqueue and if necessary sets + * a new timer to run. + * + * Must hold base->lock when calling. + */ +static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +{ + struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); + + timerqueue_del(&base->timerqueue, &alarm->node); + if (next == &alarm->node) { + hrtimer_try_to_cancel(&base->timer); + next = timerqueue_getnext(&base->timerqueue); + if (!next) + return; + hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); + } +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When a alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire when + * when the next future alarm timer expires. + */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ + struct alarm_base *base = container_of(timer, struct alarm_base, timer); + struct timerqueue_node *next; + unsigned long flags; + ktime_t now; + int ret = HRTIMER_NORESTART; + + spin_lock_irqsave(&base->lock, flags); + now = base->gettime(); + while ((next = timerqueue_getnext(&base->timerqueue))) { + struct alarm *alarm; + ktime_t expired = next->expires; + + if (expired.tv64 >= now.tv64) + break; + + alarm = container_of(next, struct alarm, node); + + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->enabled = 0; + /* Re-add periodic timers */ + if (alarm->period.tv64) { + alarm->node.expires = ktime_add(expired, alarm->period); + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->enabled = 1; + } + spin_unlock_irqrestore(&base->lock, flags); + if (alarm->function) + alarm->function(alarm); + spin_lock_irqsave(&base->lock, flags); + } + + if (next) { + hrtimer_set_expires(&base->timer, next->expires); + ret = HRTIMER_RESTART; + } + spin_unlock_irqrestore(&base->lock, flags); + + return ret; + +} + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * @state: unused + * + * When we are going into suspend, we look through the bases + * to see which is the soonest timer to expire. We then + * set an rtc timer to fire that far into the future, which + * will wake us from suspend. + */ +static int alarmtimer_suspend(struct device *dev) +{ + struct rtc_time tm; + ktime_t min, now; + unsigned long flags; + int i; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + freezer_delta = ktime_set(0, 0); + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + /* If we have no rtcdev, just return */ + if (!rtcdev) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->gettime()); + if (!min.tv64 || (delta.tv64 < min.tv64)) + min = delta; + } + if (min.tv64 == 0) + return 0; + + /* XXX - Should we enforce a minimum sleep time? */ + WARN_ON(min.tv64 < NSEC_PER_SEC); + + /* Setup an rtc timer to fire that far in the future */ + rtc_timer_cancel(rtcdev, &rtctimer); + rtc_read_time(rtcdev, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + + rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0)); + + return 0; +} +#else +static int alarmtimer_suspend(struct device *dev) +{ + return 0; +} +#endif + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + ktime_t delta; + unsigned long flags; + struct alarm_base *base = &alarm_bases[type]; + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + freezer_delta = delta; + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} + + +/** + * alarm_init - Initialize an alarm structure + * @alarm: ptr to alarm to be initialized + * @type: the type of the alarm + * @function: callback that is run when the alarm fires + */ +void alarm_init(struct alarm *alarm, enum alarmtimer_type type, + void (*function)(struct alarm *)) +{ + timerqueue_init(&alarm->node); + alarm->period = ktime_set(0, 0); + alarm->function = function; + alarm->type = type; + alarm->enabled = 0; +} + +/** + * alarm_start - Sets an alarm to fire + * @alarm: ptr to alarm to set + * @start: time to run the alarm + * @period: period at which the alarm will recur + */ +void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + if (alarm->enabled) + alarmtimer_remove(base, alarm); + alarm->node.expires = start; + alarm->period = period; + alarmtimer_enqueue(base, alarm); + alarm->enabled = 1; + spin_unlock_irqrestore(&base->lock, flags); +} + +/** + * alarm_cancel - Tries to cancel an alarm timer + * @alarm: ptr to alarm to be canceled + */ +void alarm_cancel(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + if (alarm->enabled) + alarmtimer_remove(base, alarm); + alarm->enabled = 0; + spin_unlock_irqrestore(&base->lock, flags); +} + + +/** + * clock2alarm - helper that converts from clockid to alarmtypes + * @clockid: clockid. + */ +static enum alarmtimer_type clock2alarm(clockid_t clockid) +{ + if (clockid == CLOCK_REALTIME_ALARM) + return ALARM_REALTIME; + if (clockid == CLOCK_BOOTTIME_ALARM) + return ALARM_BOOTTIME; + return -1; +} + +/** + * alarm_handle_timer - Callback for posix timers + * @alarm: alarm that fired + * + * Posix timer callback for expired alarm timers. + */ +static void alarm_handle_timer(struct alarm *alarm) +{ + struct k_itimer *ptr = container_of(alarm, struct k_itimer, + it.alarmtimer); + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; +} + +/** + * alarm_clock_getres - posix getres interface + * @which_clock: clockid + * @tp: timespec to fill + * + * Returns the granularity of underlying alarm base clock + */ +static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; + + return hrtimer_get_res(baseid, tp); +} + +/** + * alarm_clock_get - posix clock_get interface + * @which_clock: clockid + * @tp: timespec to fill. + * + * Provides the underlying alarm base time. + */ +static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + *tp = ktime_to_timespec(base->gettime()); + return 0; +} + +/** + * alarm_timer_create - posix timer_create interface + * @new_timer: k_itimer pointer to manage + * + * Initializes the k_itimer structure. + */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ + enum alarmtimer_type type; + struct alarm_base *base; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + type = clock2alarm(new_timer->it_clock); + base = &alarm_bases[type]; + alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); + return 0; +} + +/** + * alarm_timer_get - posix timer_get interface + * @new_timer: k_itimer pointer + * @cur_setting: itimerspec data to fill + * + * Copies the itimerspec data out from the k_itimer + */ +static void alarm_timer_get(struct k_itimer *timr, + struct itimerspec *cur_setting) +{ + cur_setting->it_interval = + ktime_to_timespec(timr->it.alarmtimer.period); + cur_setting->it_value = + ktime_to_timespec(timr->it.alarmtimer.node.expires); + return; +} + +/** + * alarm_timer_del - posix timer_del interface + * @timr: k_itimer pointer to be deleted + * + * Cancels any programmed alarms for the given timer. + */ +static int alarm_timer_del(struct k_itimer *timr) +{ + alarm_cancel(&timr->it.alarmtimer); + return 0; +} + +/** + * alarm_timer_set - posix timer_set interface + * @timr: k_itimer pointer to be deleted + * @flags: timer flags + * @new_setting: itimerspec to be used + * @old_setting: itimerspec being replaced + * + * Sets the timer to new_setting, and starts the timer. + */ +static int alarm_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, + struct itimerspec *old_setting) +{ + /* Save old values */ + old_setting->it_interval = + ktime_to_timespec(timr->it.alarmtimer.period); + old_setting->it_value = + ktime_to_timespec(timr->it.alarmtimer.node.expires); + + /* If the timer was already set, cancel it */ + alarm_cancel(&timr->it.alarmtimer); + + /* start the timer */ + alarm_start(&timr->it.alarmtimer, + timespec_to_ktime(new_setting->it_value), + timespec_to_ktime(new_setting->it_interval)); + return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * + * Wakes up the task that set the alarmtimer + */ +static void alarmtimer_nsleep_wakeup(struct alarm *alarm) +{ + struct task_struct *task = (struct task_struct *)alarm->data; + + alarm->data = NULL; + if (task) + wake_up_process(task); +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * + * Sets the alarm timer and sleeps until it is fired or interrupted. + */ +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +{ + alarm->data = (void *)current; + do { + set_current_state(TASK_INTERRUPTIBLE); + alarm_start(alarm, absexp, ktime_set(0, 0)); + if (likely(alarm->data)) + schedule(); + + alarm_cancel(alarm); + } while (alarm->data && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + return (alarm->data == NULL); +} + + +/** + * update_rmtp - Update remaining timespec value + * @exp: expiration time + * @type: timer type + * @rmtp: user pointer to remaining timepsec value + * + * Helper function that fills in rmtp value with time between + * now and the exp value + */ +static int update_rmtp(ktime_t exp, enum alarmtimer_type type, + struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(exp, alarm_bases[type].gettime()); + + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; + +} + +/** + * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep + * @restart: ptr to restart block + * + * Handles restarted clock_nanosleep calls + */ +static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) +{ + enum alarmtimer_type type = restart->nanosleep.clockid; + ktime_t exp; + struct timespec __user *rmtp; + struct alarm alarm; + int ret = 0; + + exp.tv64 = restart->nanosleep.expires; + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + + /* The other values in restart are already filled in */ + ret = -ERESTART_RESTARTBLOCK; +out: + return ret; +} + +/** + * alarm_timer_nsleep - alarmtimer nanosleep + * @which_clock: clockid + * @flags: determins abstime or relative + * @tsreq: requested sleep time (abs or rel) + * @rmtp: remaining sleep time saved + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsreq, struct timespec __user *rmtp) +{ + enum alarmtimer_type type = clock2alarm(which_clock); + struct alarm alarm; + ktime_t exp; + int ret = 0; + struct restart_block *restart; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + exp = timespec_to_ktime(*tsreq); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now = alarm_bases[type].gettime(); + exp = ktime_add(now, exp); + } + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + /* abs timers don't set remaining time or restart */ + if (flags == TIMER_ABSTIME) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + restart = ¤t_thread_info()->restart_block; + restart->fn = alarm_timer_nsleep_restart; + restart->nanosleep.clockid = type; + restart->nanosleep.expires = exp.tv64; + restart->nanosleep.rmtp = rmtp; + ret = -ERESTART_RESTARTBLOCK; + +out: + return ret; +} + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { + .suspend = alarmtimer_suspend, +}; + +static struct platform_driver alarmtimer_driver = { + .driver = { + .name = "alarmtimer", + .pm = &alarmtimer_pm_ops, + } +}; + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. + */ +static int __init alarmtimer_init(void) +{ + int error = 0; + int i; + struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .nsleep = alarm_timer_nsleep, + }; + + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + + /* Initialize alarm bases */ + alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; + alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; + alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + for (i = 0; i < ALARM_NUMTYPE; i++) { + timerqueue_init_head(&alarm_bases[i].timerqueue); + spin_lock_init(&alarm_bases[i].lock); + hrtimer_init(&alarm_bases[i].timer, + alarm_bases[i].base_clockid, + HRTIMER_MODE_ABS); + alarm_bases[i].timer.function = alarmtimer_fired; + } + error = platform_driver_register(&alarmtimer_driver); + platform_device_register_simple("alarmtimer", -1, NULL, 0); + + return error; +} +device_initcall(alarmtimer_init); + +#ifdef CONFIG_RTC_CLASS +/** + * has_wakealarm - check rtc device has wakealarm ability + * @dev: current device + * @name_ptr: name to be returned + * + * This helper function checks to see if the rtc device can wake + * from suspend. + */ +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(const char **)name_ptr = dev_name(dev); + return 1; +} + +/** + * alarmtimer_init_late - Late initializing of alarmtimer code + * + * This function locates a rtc device to use for wakealarms. + * Run as late_initcall to make sure rtc devices have been + * registered. + */ +static int __init alarmtimer_init_late(void) +{ + struct device *dev; + char *str; + + /* Find an rtc device and init the rtc_timer */ + dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); + /* If we have a device then str is valid. See has_wakealarm() */ + if (dev) { + rtcdev = rtc_class_open(str); + /* + * Drop the reference we got in class_find_device, + * rtc_open takes its own. + */ + put_device(dev); + } + if (!rtcdev) { + printk(KERN_WARNING "No RTC device found, ALARM timers will" + " not wake from suspend"); + } + rtc_timer_init(&rtctimer, NULL, NULL); + + return 0; +} +#else +static int __init alarmtimer_init_late(void) +{ + printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers" + " will not wake from suspend"); + return 0; +} +#endif +late_initcall(alarmtimer_init_late); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..c027d4f602f1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,7 +18,6 @@ #include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysdev.h> -#include <linux/tick.h> #include "tick-internal.h" @@ -195,6 +194,70 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); +static void clockevents_config(struct clock_event_device *dev, + u32 freq) +{ + u64 sec; + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * Calculate the maximum number of seconds we can sleep. Limit + * to 10 minutes for hardware which can program more than + * 32bit ticks so we still get reasonable conversion values. + */ + sec = dev->max_delta_ticks; + do_div(sec, freq); + if (!sec) + sec = 1; + else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) + sec = 600; + + clockevents_calc_mult_shift(dev, freq, sec); + dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); + dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev: device to register + * @freq: The clock frequency + * @min_delta: The minimum clock ticks to program in oneshot mode + * @max_delta: The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta) +{ + dev->min_delta_ticks = min_delta; + dev->max_delta_ticks = max_delta; + clockevents_config(dev, freq); + clockevents_register_device(dev); +} + +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev: device to modify + * @freq: new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events with interrupts disabled! Returns 0 on success, + * -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return 0; + + return clockevents_program_event(dev, dev->next_event, ktime_get()); +} + /* * Noop handler when we shut down an event device */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6519cf62d9cd..1c95fd677328 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs) list_add(&cs->list, entry); } - -/* - * Maximum time we expect to go between ticks. This includes idle - * tickless time. It provides the trade off between selecting a - * mult/shift pair that is very precise but can only handle a short - * period of time, vs. a mult/shift pair that can handle long periods - * of time but isn't as precise. - * - * This is a subsystem constant, and actual hardware limitations - * may override it (ie: clocksources that wrap every 3 seconds). - */ -#define MAX_UPDATE_LENGTH 5 /* Seconds */ - /** * __clocksource_updatefreq_scale - Used update clocksource with new freq * @t: clocksource to be registered @@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs) */ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { + u64 sec; + /* - * Ideally we want to use some of the limits used in - * clocksource_max_deferment, to provide a more informed - * MAX_UPDATE_LENGTH. But for now this just gets the - * register interface working properly. + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. We apply the same 12.5% + * margin as we do in clocksource_max_deferment() */ + sec = (cs->mask - (cs->mask >> 5)); + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC/scale, - MAX_UPDATE_LENGTH*scale); + NSEC_PER_SEC / scale, sec * scale); cs->max_idle_ns = clocksource_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } @@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs) mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..a470154e0408 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,8 +22,11 @@ ************************************************************************/ #include <linux/clocksource.h> #include <linux/jiffies.h> +#include <linux/module.h> #include <linux/init.h> +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. It has the same coarse resolution as @@ -31,7 +34,7 @@ * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. It is also not recommended * for "tick-less" systems. */ #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) @@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242fa921..f6117a4c7cb8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,6 +16,8 @@ #include <linux/mm.h> #include <linux/module.h> +#include "tick-internal.h" + /* * NTP timekeeping variables: */ @@ -646,6 +648,19 @@ int do_adjtimex(struct timex *txc) hrtimer_cancel(&leap_timer); } + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + result = timekeeping_inject_offset(&delta); + if (result) + return result; + } + getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..c340ca658f37 --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,445 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/file.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + down_read(&clk->rwsem); + + if (!clk->zombie) + return clk; + + up_read(&clk->rwsem); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + down_read(&clk->rwsem); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + up_read(&clk->rwsem); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + init_rwsem(&clk->rwsem); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + down_write(&clk->rwsem); + clk->zombie = true; + up_write(&clk->rwsem); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index a3b5aff62606..c7218d132738 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" @@ -457,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Periodic mode does not care about the enter/exit of power * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - goto out; + return; - bc = tick_broadcast_device.evtdev; + /* + * We are called with preemtion disabled from the depth of the + * idle code, so we can't be moved away. + */ cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return; + bc = tick_broadcast_device.evtdev; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); @@ -490,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_program_event(dev->next_event, 1); } } - -out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -523,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { + int cpu = smp_processor_id(); + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; - int cpu = smp_processor_id(); bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -552,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); } } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ed228ef6f6b8..119528de8235 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <asm/irq_regs.h> diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f65d3a723a64..1009b06d6f89 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -135,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +#endif + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101f908b..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include "tick-internal.h" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea2433471..d5097c44b407 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,7 +19,6 @@ #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> -#include <linux/tick.h> #include <linux/module.h> #include <asm/irq_regs.h> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c7562902c..342408cf68dd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/mm.h> #include <linux/sched.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; unsigned long flags; @@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +int timekeeping_inject_offset(struct timespec *ts) +{ + unsigned long flags; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + timekeeping_forward_now(); + + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + + timekeeper.ntp_error = 0; + ntp_clear(); + + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -560,14 +596,65 @@ void __init timekeeping_init(void) static struct timespec timekeeping_suspend_time; /** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @delta: pointer to a timespec delta value + * + * Takes a timespec offset measuring a suspend interval and properly + * adds the sleep offset to the timekeeping variables. + */ +static void __timekeeping_inject_sleeptime(struct timespec *delta) +{ + xtime = timespec_add(xtime, *delta); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); + total_sleep_time = timespec_add(total_sleep_time, *delta); +} + + +/** + * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec delta value + * + * This hook is for architectures that cannot support read_persistent_clock + * because their RTC/persistent clock is only accessible when irqs are enabled. + * + * This function should only be called by rtc_resume(), and allows + * a suspend offset to be injected into the timekeeping values. + */ +void timekeeping_inject_sleeptime(struct timespec *delta) +{ + unsigned long flags; + struct timespec ts; + + /* Make sure we don't set the clock twice */ + read_persistent_clock(&ts); + if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) + return; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_forward_now(); + + __timekeeping_inject_sleeptime(delta); + + timekeeper.ntp_error = 0; + ntp_clear(); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); +} + + +/** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. */ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { unsigned long flags; struct timespec ts; @@ -580,9 +667,7 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add(xtime, ts); - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add(total_sleep_time, ts); + __timekeeping_inject_sleeptime(&ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -595,12 +680,10 @@ static int timekeeping_resume(struct sys_device *dev) clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); /* Resume hrtimers */ - hres_timers_resume(); - - return 0; + hrtimers_resume(); } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { unsigned long flags; @@ -618,26 +701,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further @@ -779,7 +854,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) * * Called from the timer interrupt, must hold a write on xtime_lock. */ -void update_wall_time(void) +static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; @@ -871,7 +946,7 @@ void update_wall_time(void) * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -889,6 +964,55 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timespec tomono, sleep; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + sleep = total_sleep_time; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + /** * monotonic_to_bootbased - Convert the monotonic time to boot based. * @ts: pointer to the timespec to be converted @@ -910,11 +1034,6 @@ struct timespec __current_kernel_time(void) return xtime; } -struct timespec __get_wall_to_monotonic(void) -{ - return wall_to_monotonic; -} - struct timespec current_kernel_time(void) { struct timespec now; @@ -946,3 +1065,63 @@ struct timespec get_monotonic_coarse(void) now.tv_nsec + mono.tv_nsec); return now; } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_wall_time(); + calc_global_load(ticks); +} + +/** + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend + */ +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + *sleep = total_sleep_time; + } while (read_seqretry(&xtime_lock, seq)); +} + +/** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ + unsigned long seq; + struct timespec wtom; + + do { + seq = read_seqbegin(&xtime_lock); + wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + return timespec_to_ktime(wtom); +} + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7d..a5d0a3a85dd8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, unsigned int timer_flag) { /* - * It doesnt matter which lock we take: + * It doesn't matter which lock we take: */ raw_spinlock_t *lock; struct entry *entry, input; diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..fd6198692b57 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} static struct debug_obj_descr timer_debug_descr; +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", + .debug_hint = timer_debug_hint, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, @@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. * + * Note: You must not hold locks that are held in interrupt context + * while calling this function. Even if the lock has nothing to do + * with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * * The function returns whether it has deactivated a pending timer or not. */ int del_timer_sync(struct timer_list *timer) @@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); @@ -1295,19 +1324,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} - #ifdef __ARCH_WANT_SYS_ALARM /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 14674dce77a6..2ad39e556cb4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !S390 + select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER @@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: - /sys/kernel/debug/tracing/profile_annotated_branch + /sys/kernel/debug/tracing/trace_stat/branch_annotated Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. @@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES taken in the kernel is recorded whether it hit or miss. The results will be displayed in: - /sys/kernel/debug/tracing/profile_branch + /sys/kernel/debug/tracing/trace_stat/branch_all This option also enables the likely/unlikely profiler. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cbafed7d4f38..6957aa298dfa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + u32 what) { struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; if (likely(!bt)) return; - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, - what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) { struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} + __be64 rpdu = cpu_to_be64(depth); + u32 what; -static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); - WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); @@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..1ee417fcbfa5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -39,20 +39,26 @@ #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ - do { \ - if (WARN_ON(cond)) \ + ({ \ + int ___r = cond; \ + if (WARN_ON(___r)) \ ftrace_kill(); \ - } while (0) + ___r; \ + }) #define FTRACE_WARN_ON_ONCE(cond) \ - do { \ - if (WARN_ON_ONCE(cond)) \ + ({ \ + int ___r = cond; \ + if (WARN_ON_ONCE(___r)) \ ftrace_kill(); \ - } while (0) + ___r; \ + }) /* hash bits for specific function selection */ #define FTRACE_HASH_BITS 7 #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; @@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly = .func = ftrace_stub, }; -static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +static struct ftrace_ops global_ops; + +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); /* - * Traverse the ftrace_list, invoking all entries. The reason that we + * Traverse the ftrace_global_list, invoking all entries. The reason that we * can use rcu_dereference_raw() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_list. + * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_global_list_func(unsigned long ip, + unsigned long parent_ip) { - struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ + struct ftrace_ops *op; + if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + return; + + trace_recursion_set(TRACE_GLOBAL_BIT); + op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { op->func(ip, parent_ip); op = rcu_dereference_raw(op->next); /*see above*/ }; + trace_recursion_clear(TRACE_GLOBAL_BIT); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) @@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif -static int __register_ftrace_function(struct ftrace_ops *ops) +static void update_global_ops(void) { - ops->next = ftrace_list; + ftrace_func_t func; + /* - * We are entering ops into the ftrace_list but another - * CPU might be walking that list. We need to make sure - * the ops->next pointer is valid before another CPU sees - * the ops pointer included into the ftrace_list. + * If there's only one function registered, then call that + * function directly. Otherwise, we need to iterate over the + * registered callers. */ - rcu_assign_pointer(ftrace_list, ops); + if (ftrace_global_list == &ftrace_list_end || + ftrace_global_list->next == &ftrace_list_end) + func = ftrace_global_list->func; + else + func = ftrace_global_list_func; - if (ftrace_enabled) { - ftrace_func_t func; + /* If we filter on pids, update to use the pid function */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } - if (ops->next == &ftrace_list_end) - func = ops->func; - else - func = ftrace_list_func; + global_ops.func = func; +} - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + update_global_ops(); + + /* + * If we are at the end of the list and this ops is + * not dynamic, then have the mcount trampoline call + * the function directly + */ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list->next == &ftrace_list_end && + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + func = ftrace_ops_list->func; + else + func = ftrace_ops_list_func; - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; + ftrace_trace_function = func; #else - __ftrace_trace_function = func; - ftrace_trace_function = ftrace_test_stop_func; + __ftrace_trace_function = func; + ftrace_trace_function = ftrace_test_stop_func; #endif - } +} - return 0; +static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +{ + ops->next = *list; + /* + * We are entering ops into the list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the list. + */ + rcu_assign_pointer(*list, ops); } -static int __unregister_ftrace_function(struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (ftrace_list == ops && ops->next == &ftrace_list_end) { - ftrace_trace_function = ftrace_stub; - ftrace_list = &ftrace_list_end; + if (*list == ops && ops->next == &ftrace_list_end) { + *list = &ftrace_list_end; return 0; } - for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + for (p = list; *p != &ftrace_list_end; p = &(*p)->next) if (*p == ops) break; @@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -1; *p = (*p)->next; + return 0; +} - if (ftrace_enabled) { - /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) { - ftrace_func_t func = ftrace_list->func; +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ftrace_disabled) + return -ENODEV; - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; -#endif - } - } + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + + if (!core_kernel_data((unsigned long)ops)) + ops->flags |= FTRACE_OPS_FL_DYNAMIC; + + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + int first = ftrace_global_list == &ftrace_list_end; + add_ftrace_ops(&ftrace_global_list, ops); + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (first) + add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else + add_ftrace_ops(&ftrace_ops_list, ops); + + if (ftrace_enabled) + update_ftrace_function(); return 0; } -static void ftrace_update_pid_func(void) +static int __unregister_ftrace_function(struct ftrace_ops *ops) { - ftrace_func_t func; + int ret; - if (ftrace_trace_function == ftrace_stub) - return; + if (ftrace_disabled) + return -ENODEV; -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - func = ftrace_trace_function; -#else - func = __ftrace_trace_function; -#endif + if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) + return -EBUSY; - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } else { - if (func == ftrace_pid_func) - func = ftrace_pid_function; - } + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - ftrace_trace_function = func; -#else - __ftrace_trace_function = func; -#endif + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ret = remove_ftrace_ops(&ftrace_global_list, ops); + if (!ret && ftrace_global_list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + if (!ret) + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + + if (ret < 0) + return ret; + + if (ftrace_enabled) + update_ftrace_function(); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + synchronize_sched(); + + return 0; +} + +static void ftrace_update_pid_func(void) +{ + /* Only do something if we are tracing something */ + if (ftrace_trace_function == ftrace_stub) + return; + + update_ftrace_function(); } #ifdef CONFIG_FUNCTION_PROFILER @@ -888,8 +958,35 @@ enum { FTRACE_START_FUNC_RET = (1 << 3), FTRACE_STOP_FUNC_RET = (1 << 4), }; +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; -static int ftrace_filtered; +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. + */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { + .buckets = (struct hlist_head *)empty_buckets, +}; +#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) + +static struct ftrace_ops global_ops = { + .func = ftrace_stub, + .notrace_hash = EMPTY_HASH, + .filter_hash = EMPTY_HASH, +}; static struct dyn_ftrace *ftrace_new_addrs; @@ -912,6 +1009,269 @@ static struct ftrace_page *ftrace_pages; static struct dyn_ftrace *ftrace_free_records; +static struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + unsigned long key; + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *n; + + if (!hash->count) + return NULL; + + if (hash->size_bits > 0) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + + hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + if (entry->ip == ip) + return entry; + } + return NULL; +} + +static void __add_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + struct hlist_head *hhd; + unsigned long key; + + if (hash->size_bits) + key = hash_long(entry->ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + hlist_add_head(&entry->hlist, hhd); + hash->count++; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->ip = ip; + __add_hash_entry(hash, entry); + + return 0; +} + +static void +free_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + kfree(entry); + hash->count--; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ + struct hlist_head *hhd; + struct hlist_node *tp, *tn; + struct ftrace_func_entry *entry; + int size = 1 << hash->size_bits; + int i; + + if (!hash->count) + return; + + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + free_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + +static void free_ftrace_hash(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + ftrace_hash_clear(hash); + kfree(hash->buckets); + kfree(hash); +} + +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ + struct ftrace_hash *hash; + + hash = container_of(rcu, struct ftrace_hash, rcu); + free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); +} + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ + struct ftrace_hash *hash; + int size; + + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return NULL; + + size = 1 << size_bits; + hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + + if (!hash->buckets) { + kfree(hash); + return NULL; + } + + hash->size_bits = size_bits; + + return hash; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + struct hlist_node *tp; + int size; + int ret; + int i; + + new_hash = alloc_ftrace_hash(size_bits); + if (!new_hash) + return NULL; + + /* Empty hash? */ + if (!hash || !hash->count) + return new_hash; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + ret = add_hash_entry(new_hash, entry->ip); + if (ret < 0) + goto free_hash; + } + } + + FTRACE_WARN_ON(new_hash->count != hash->count); + + return new_hash; + + free_hash: + free_ftrace_hash(new_hash); + return NULL; +} + +static int +ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tp, *tn; + struct hlist_head *hhd; + struct ftrace_hash *old_hash; + struct ftrace_hash *new_hash; + unsigned long key; + int size = src->count; + int bits = 0; + int i; + + /* + * If the new source is empty, just free dst and assign it + * the empty_hash. + */ + if (!src->count) { + free_ftrace_hash_rcu(*dst); + rcu_assign_pointer(*dst, EMPTY_HASH); + return 0; + } + + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + /* Don't allocate too much */ + if (bits > FTRACE_HASH_MAX_BITS) + bits = FTRACE_HASH_MAX_BITS; + + new_hash = alloc_ftrace_hash(bits); + if (!new_hash) + return -ENOMEM; + + size = 1 << src->size_bits; + for (i = 0; i < size; i++) { + hhd = &src->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { + if (bits > 0) + key = hash_long(entry->ip, bits); + else + key = 0; + remove_hash_entry(src, entry); + __add_hash_entry(new_hash, entry); + } + } + + old_hash = *dst; + rcu_assign_pointer(*dst, new_hash); + free_ftrace_hash_rcu(old_hash); + + return 0; +} + +/* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + * AND + * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu_sched(). + */ +static int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + struct ftrace_hash *filter_hash; + struct ftrace_hash *notrace_hash; + int ret; + + filter_hash = rcu_dereference_raw(ops->filter_hash); + notrace_hash = rcu_dereference_raw(ops->notrace_hash); + + if ((!filter_hash || !filter_hash->count || + ftrace_lookup_ip(filter_hash, ip)) && + (!notrace_hash || !notrace_hash->count || + !ftrace_lookup_ip(notrace_hash, ip))) + ret = 1; + else + ret = 0; + + return ret; +} + /* * This is a double for. Do not use 'break' to break out of the loop, * you must use a goto. @@ -926,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records; } \ } +static void __ftrace_hash_rec_update(struct ftrace_ops *ops, + int filter_hash, + bool inc) +{ + struct ftrace_hash *hash; + struct ftrace_hash *other_hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int count = 0; + int all = 0; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; + + /* + * In the filter_hash case: + * If the count is zero, we update all records. + * Otherwise we just update the items in the hash. + * + * In the notrace_hash case: + * We enable the update in the hash. + * As disabling notrace means enabling the tracing, + * and enabling notrace means disabling, the inc variable + * gets inversed. + */ + if (filter_hash) { + hash = ops->filter_hash; + other_hash = ops->notrace_hash; + if (!hash || !hash->count) + all = 1; + } else { + inc = !inc; + hash = ops->notrace_hash; + other_hash = ops->filter_hash; + /* + * If the notrace hash has no items, + * then there's nothing to do. + */ + if (hash && !hash->count) + return; + } + + do_for_each_ftrace_rec(pg, rec) { + int in_other_hash = 0; + int in_hash = 0; + int match = 0; + + if (all) { + /* + * Only the filter_hash affects all records. + * Update if the record is not in the notrace hash. + */ + if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) + match = 1; + } else { + in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + + /* + * + */ + if (filter_hash && in_hash && !in_other_hash) + match = 1; + else if (!filter_hash && in_hash && + (in_other_hash || !other_hash->count)) + match = 1; + } + if (!match) + continue; + + if (inc) { + rec->flags++; + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + return; + } else { + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + return; + rec->flags--; + } + count++; + /* Shortcut, if we handled all records, we are done. */ + if (!all && count == hash->count) + return; + } while_for_each_ftrace_rec(); +} + +static void ftrace_hash_rec_disable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 1); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1047,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ftrace_addr = (unsigned long)FTRACE_ADDR; /* - * If this record is not to be traced or we want to disable it, - * then disable it. + * If we are enabling tracing: * - * If we want to enable it and filtering is off, then enable it. + * If the record has a ref count, then we need to enable it + * because someone is using it. * - * If we want to enable it and filtering is on, enable it only if - * it's filtered + * Otherwise we make sure its disabled. + * + * If we are disabling tracing, then disable all records that + * are enabled. */ - if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { - if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) - flag = FTRACE_FL_ENABLED; - } + if (enable && (rec->flags & ~FTRACE_FL_MASK)) + flag = FTRACE_FL_ENABLED; /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1079,19 +1538,16 @@ static void ftrace_replace_code(int enable) struct ftrace_page *pg; int failed; + if (unlikely(ftrace_disabled)) + return; + do_for_each_ftrace_rec(pg, rec) { - /* - * Skip over free records, records that have - * failed and not converted. - */ - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || - !(rec->flags & FTRACE_FL_CONVERTED)) + /* Skip over free records */ + if (rec->flags & FTRACE_FL_FREE) continue; failed = __ftrace_replace_code(rec, enable); if (failed) { - rec->flags |= FTRACE_FL_FAILED; ftrace_bug(failed, rec->ip); /* Stop processing */ return; @@ -1107,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ip = rec->ip; + if (unlikely(ftrace_disabled)) + return 0; + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { ftrace_bug(ret, ip); - rec->flags |= FTRACE_FL_FAILED; return 0; } return 1; @@ -1171,6 +1629,7 @@ static void ftrace_run_update_code(int command) static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; +static int global_start_up; static void ftrace_startup_enable(int command) { @@ -1185,19 +1644,38 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } -static void ftrace_startup(int command) +static int ftrace_startup(struct ftrace_ops *ops, int command) { + bool hash_enable = true; + if (unlikely(ftrace_disabled)) - return; + return -ENODEV; ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; + /* ops marked global share the filter hashes */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + /* Don't update hash if global is already set */ + if (global_start_up) + hash_enable = false; + global_start_up++; + } + + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (hash_enable) + ftrace_hash_rec_enable(ops, 1); + ftrace_startup_enable(command); + + return 0; } -static void ftrace_shutdown(int command) +static void ftrace_shutdown(struct ftrace_ops *ops, int command) { + bool hash_disable = true; + if (unlikely(ftrace_disabled)) return; @@ -1209,6 +1687,23 @@ static void ftrace_shutdown(int command) */ WARN_ON_ONCE(ftrace_start_up < 0); + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + global_start_up--; + WARN_ON_ONCE(global_start_up < 0); + /* Don't update hash if global still has users */ + if (global_start_up) { + WARN_ON_ONCE(!ftrace_start_up); + hash_disable = false; + } + } + + if (hash_disable) + ftrace_hash_rec_disable(ops, 1); + + if (ops != &global_ops || !global_start_up) + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; @@ -1268,15 +1763,15 @@ static int ftrace_update_code(struct module *mod) p->flags = 0L; /* - * Do the initial record convertion from mcount jump + * Do the initial record conversion from mcount jump * to the NOP instructions. */ if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); - continue; + /* Game over */ + break; } - p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; /* @@ -1351,9 +1846,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), - FTRACE_ITER_FAILURES = (1 << 2), - FTRACE_ITER_PRINTALL = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_HASH = (1 << 3), + FTRACE_ITER_ENABLED = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1365,6 +1860,8 @@ struct ftrace_iterator { struct dyn_ftrace *func; struct ftrace_func_probe *probe; struct trace_parser parser; + struct ftrace_hash *hash; + struct ftrace_ops *ops; int hidx; int idx; unsigned flags; @@ -1461,13 +1958,17 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; struct dyn_ftrace *rec = NULL; + if (unlikely(ftrace_disabled)) + return NULL; + if (iter->flags & FTRACE_ITER_HASH) return t_hash_next(m, pos); (*pos)++; - iter->pos = *pos; + iter->pos = iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return t_hash_start(m, pos); @@ -1483,17 +1984,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FREE) || - (!(iter->flags & FTRACE_ITER_FAILURES) && - (rec->flags & FTRACE_FL_FAILED)) || - - ((iter->flags & FTRACE_ITER_FAILURES) && - !(rec->flags & FTRACE_FL_FAILED)) || - ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !(rec->flags & FTRACE_FL_NOTRACE))) { + !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + + ((iter->flags & FTRACE_ITER_ENABLED) && + !(rec->flags & ~FTRACE_FL_MASK))) { + rec = NULL; goto retry; } @@ -1502,7 +2001,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (!rec) return t_hash_start(m, pos); - iter->func_pos = *pos; iter->func = rec; return iter; @@ -1518,10 +2016,15 @@ static void reset_iter_read(struct ftrace_iterator *iter) static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; void *p = NULL; loff_t l; mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + return NULL; + /* * If an lseek was done, then reset and start from beginning. */ @@ -1533,7 +2036,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) * off, we can short cut and just print out that all * functions are enabled. */ - if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { + if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -1591,7 +2094,11 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps\n", (void *)rec->ip); + seq_printf(m, "%ps", (void *)rec->ip); + if (iter->flags & FTRACE_ITER_ENABLED) + seq_printf(m, " (%ld)", + rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, "\n"); return 0; } @@ -1631,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file) } static int -ftrace_failures_open(struct inode *inode, struct file *file) +ftrace_enabled_open(struct inode *inode, struct file *file) { - int ret; - struct seq_file *m; struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; - ret = ftrace_avail_open(inode, file); + ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { - m = file->private_data; - iter = m->private; - iter->flags = FTRACE_ITER_FAILURES; + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); } return ret; } - -static void ftrace_filter_reset(int enable) +static void ftrace_filter_reset(struct ftrace_hash *hash) { - struct ftrace_page *pg; - struct dyn_ftrace *rec; - unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - mutex_lock(&ftrace_lock); - if (enable) - ftrace_filtered = 0; - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; - rec->flags &= ~type; - } while_for_each_ftrace_rec(); + ftrace_hash_clear(hash); mutex_unlock(&ftrace_lock); } static int -ftrace_regex_open(struct inode *inode, struct file *file, int enable) +ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + struct ftrace_hash *hash; int ret = 0; if (unlikely(ftrace_disabled)) @@ -1683,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) return -ENOMEM; } + if (flag & FTRACE_ITER_NOTRACE) + hash = ops->notrace_hash; + else + hash = ops->filter_hash; + + iter->ops = ops; + iter->flags = flag; + + if (file->f_mode & FMODE_WRITE) { + mutex_lock(&ftrace_lock); + iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + mutex_unlock(&ftrace_lock); + + if (!iter->hash) { + trace_parser_put(&iter->parser); + kfree(iter); + return -ENOMEM; + } + } + mutex_lock(&ftrace_regex_lock); + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_filter_reset(enable); + ftrace_filter_reset(iter->hash); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->flags = enable ? FTRACE_ITER_FILTER : - FTRACE_ITER_NOTRACE; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = iter; } else { + /* Failed */ + free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); kfree(iter); } @@ -1711,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(inode, file, 1); + return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, + inode, file); } static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(inode, file, 0); + return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, + inode, file); } static loff_t @@ -1762,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type) } static int -ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +{ + struct ftrace_func_entry *entry; + int ret = 0; + + entry = ftrace_lookup_ip(hash, rec->ip); + if (not) { + /* Do nothing if it doesn't exist */ + if (!entry) + return 0; + + free_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + if (entry) + return 0; + + ret = add_hash_entry(hash, rec->ip); + } + return ret; +} + +static int +ftrace_match_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) { char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (mod) { + /* module lookup requires matching the module */ + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (!len) + return 1; + } - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); return ftrace_match(str, regex, len, type); } -static int ftrace_match_records(char *buff, int len, int enable) +static int +match_records(struct ftrace_hash *hash, char *buff, + int len, char *mod, int not) { - unsigned int search_len; + unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; - unsigned long flag; - char *search; - int type; - int not; + int type = MATCH_FULL; + char *search = buff; int found = 0; + int ret; - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = filter_parse_regex(buff, len, &search, ¬); - - search_len = strlen(search); + if (len) { + type = filter_parse_regex(buff, len, &search, ¬); + search_len = strlen(search); + } mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; + if (unlikely(ftrace_disabled)) + goto out_unlock; - if (ftrace_match_record(rec, search, search_len, type)) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; + do_for_each_ftrace_rec(pg, rec) { + + if (ftrace_match_record(rec, mod, search, search_len, type)) { + ret = enter_record(hash, rec, not); + if (ret < 0) { + found = ret; + goto out_unlock; + } found = 1; } - /* - * Only enable filtering if we have a function that - * is filtered on. - */ - if (enable && (rec->flags & FTRACE_FL_FILTER)) - ftrace_filtered = 1; } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); return found; } static int -ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, - char *regex, int len, int type) +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) { - char str[KSYM_SYMBOL_LEN]; - char *modname; - - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); - - if (!modname || strcmp(modname, mod)) - return 0; - - /* blank search means to match all funcs in the mod */ - if (len) - return ftrace_match(str, regex, len, type); - else - return 1; + return match_records(hash, buff, len, NULL, 0); } -static int ftrace_match_module_records(char *buff, char *mod, int enable) +static int +ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) { - unsigned search_len = 0; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int type = MATCH_FULL; - char *search = buff; - unsigned long flag; int not = 0; - int found = 0; - - flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; /* blank or '*' mean the same */ if (strcmp(buff, "*") == 0) @@ -1853,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) not = 1; } - if (strlen(buff)) { - type = filter_parse_regex(buff, strlen(buff), &search, ¬); - search_len = strlen(search); - } - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - - if (rec->flags & FTRACE_FL_FAILED) - continue; - - if (ftrace_match_module_record(rec, mod, - search, search_len, type)) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; - found = 1; - } - if (enable && (rec->flags & FTRACE_FL_FILTER)) - ftrace_filtered = 1; - - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); - - return found; + return match_records(hash, buff, strlen(buff), mod, not); } /* @@ -1889,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) static int ftrace_mod_callback(char *func, char *cmd, char *param, int enable) { + struct ftrace_ops *ops = &global_ops; + struct ftrace_hash *hash; char *mod; + int ret = -EINVAL; /* * cmd == 'mod' because we only registered this func @@ -1901,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) /* we must have a module name */ if (!param) - return -EINVAL; + return ret; mod = strsep(¶m, ":"); if (!strlen(mod)) - return -EINVAL; + return ret; - if (ftrace_match_module_records(func, mod, enable)) - return 0; - return -EINVAL; + if (enable) + hash = ops->filter_hash; + else + hash = ops->notrace_hash; + + ret = ftrace_match_module_records(hash, func, mod); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + + return 0; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -1960,6 +2492,7 @@ static int ftrace_probe_registered; static void __enable_ftrace_function_probe(void) { + int ret; int i; if (ftrace_probe_registered) @@ -1974,13 +2507,16 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_probe_ops); - ftrace_startup(0); + ret = __register_ftrace_function(&trace_probe_ops); + if (!ret) + ret = ftrace_startup(&trace_probe_ops, 0); + ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { + int ret; int i; if (!ftrace_probe_registered) @@ -1993,8 +2529,10 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_probe_ops); - ftrace_shutdown(0); + ret = __unregister_ftrace_function(&trace_probe_ops); + if (!ret) + ftrace_shutdown(&trace_probe_ops, 0); + ftrace_probe_registered = 0; } @@ -2030,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return -EINVAL; mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_FAILED) - continue; + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { - if (!ftrace_match_record(rec, search, len, type)) + if (!ftrace_match_record(rec, NULL, search, len, type)) continue; entry = kmalloc(sizeof(*entry), GFP_KERNEL); @@ -2196,18 +2735,22 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(char *buff, int len, int enable) +static int ftrace_process_regex(struct ftrace_hash *hash, + char *buff, int len, int enable) { char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); if (!next) { - if (ftrace_match_records(func, len, enable)) - return 0; - return ret; + ret = ftrace_match_records(hash, func, len); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + return 0; } /* command found */ @@ -2240,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, mutex_lock(&ftrace_regex_lock); + ret = -ENODEV; + if (unlikely(ftrace_disabled)) + goto out_unlock; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; iter = m->private; @@ -2251,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(parser->buffer, + ret = ftrace_process_regex(iter->hash, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret) @@ -2279,22 +2826,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, return ftrace_regex_write(file, ubuf, cnt, ppos, 0); } -static void -ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) { + struct ftrace_hash **orig_hash; + struct ftrace_hash *hash; + int ret; + + /* All global ops uses the global ops filters */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) + ops = &global_ops; + if (unlikely(ftrace_disabled)) - return; + return -ENODEV; + + if (enable) + orig_hash = &ops->filter_hash; + else + orig_hash = &ops->notrace_hash; + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + return -ENOMEM; mutex_lock(&ftrace_regex_lock); if (reset) - ftrace_filter_reset(enable); + ftrace_filter_reset(hash); if (buf) - ftrace_match_records(buf, len, enable); + ftrace_match_records(hash, buf, len); + + mutex_lock(&ftrace_lock); + ret = ftrace_hash_move(orig_hash, hash); + mutex_unlock(&ftrace_lock); + mutex_unlock(&ftrace_regex_lock); + + free_ftrace_hash(hash); + return ret; } /** * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -2302,13 +2876,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(unsigned char *buf, int len, int reset) +void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) { - ftrace_set_regex(buf, len, reset, 1); + ftrace_set_regex(ops, buf, len, reset, 1); } +EXPORT_SYMBOL_GPL(ftrace_set_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -2317,10 +2894,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(unsigned char *buf, int len, int reset) +void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) { - ftrace_set_regex(buf, len, reset, 0); + ftrace_set_regex(ops, buf, len, reset, 0); } +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_global_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); /* * command line interface to allow users to set filters on boot up. @@ -2371,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static void __init set_ftrace_early_filter(char *buf, int enable) +static void __init +set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) { char *func; while (buf) { func = strsep(&buf, ","); - ftrace_set_regex(func, strlen(func), 0, enable); + ftrace_set_regex(ops, func, strlen(func), 0, enable); } } static void __init set_ftrace_early_filters(void) { if (ftrace_filter_buf[0]) - set_ftrace_early_filter(ftrace_filter_buf, 1); + set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) - set_ftrace_early_filter(ftrace_notrace_buf, 0); + set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_graph_buf[0]) set_ftrace_early_graph(ftrace_graph_buf); @@ -2394,11 +3006,14 @@ static void __init set_ftrace_early_filters(void) } static int -ftrace_regex_release(struct inode *inode, struct file *file, int enable) +ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct ftrace_hash **orig_hash; struct trace_parser *parser; + int filter_hash; + int ret; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2411,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) parser = &iter->parser; if (trace_parser_loaded(parser)) { parser->buffer[parser->idx] = 0; - ftrace_match_records(parser->buffer, parser->idx, enable); + ftrace_match_records(iter->hash, parser->buffer, parser->idx); } - mutex_lock(&ftrace_lock); - if (ftrace_start_up && ftrace_enabled) - ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftrace_lock); - trace_parser_put(parser); + + if (file->f_mode & FMODE_WRITE) { + filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); + + if (filter_hash) + orig_hash = &iter->ops->filter_hash; + else + orig_hash = &iter->ops->notrace_hash; + + mutex_lock(&ftrace_lock); + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(iter->ops, filter_hash); + ret = ftrace_hash_move(orig_hash, iter->hash); + if (!ret) { + ftrace_hash_rec_enable(iter->ops, filter_hash); + if (iter->ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + } + mutex_unlock(&ftrace_lock); + } + free_ftrace_hash(iter->hash); kfree(iter); mutex_unlock(&ftrace_regex_lock); return 0; } -static int -ftrace_filter_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 1); -} - -static int -ftrace_notrace_release(struct inode *inode, struct file *file) -{ - return ftrace_regex_release(inode, file, 0); -} - static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -2445,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = { .release = seq_release_private, }; -static const struct file_operations ftrace_failures_fops = { - .open = ftrace_failures_open, +static const struct file_operations ftrace_enabled_fops = { + .open = ftrace_enabled_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, @@ -2457,7 +3080,7 @@ static const struct file_operations ftrace_filter_fops = { .read = seq_read, .write = ftrace_filter_write, .llseek = ftrace_regex_lseek, - .release = ftrace_filter_release, + .release = ftrace_regex_release, }; static const struct file_operations ftrace_notrace_fops = { @@ -2465,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = { .read = seq_read, .write = ftrace_notrace_write, .llseek = ftrace_regex_lseek, - .release = ftrace_notrace_release, + .release = ftrace_regex_release, }; #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -2574,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) bool exists; int i; - if (ftrace_disabled) - return -ENODEV; - /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) @@ -2585,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) search_len = strlen(search); mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) { + mutex_unlock(&ftrace_lock); + return -ENODEV; + } + do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + if (rec->flags & FTRACE_FL_FREE) continue; - if (ftrace_match_record(rec, search, search_len, type)) { + if (ftrace_match_record(rec, NULL, search, search_len, type)) { /* if it is in the array */ exists = false; for (i = 0; i < *idx; i++) { @@ -2680,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("available_filter_functions", 0444, d_tracer, NULL, &ftrace_avail_fops); - trace_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); + trace_create_file("enabled_functions", 0444, + d_tracer, NULL, &ftrace_enabled_fops); trace_create_file("set_ftrace_filter", 0644, d_tracer, NULL, &ftrace_filter_fops); @@ -2704,7 +3330,6 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; - unsigned long flags; mutex_lock(&ftrace_lock); p = start; @@ -2721,10 +3346,7 @@ static int ftrace_process_locs(struct module *mod, ftrace_record_ip(addr); } - /* disable interrupts to prevent kstop machine */ - local_irq_save(flags); ftrace_update_code(mod); - local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; @@ -2736,10 +3358,11 @@ void ftrace_release_mod(struct module *mod) struct dyn_ftrace *rec; struct ftrace_page *pg; + mutex_lock(&ftrace_lock); + if (ftrace_disabled) - return; + goto out_unlock; - mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (within_module_core(rec->ip, mod)) { /* @@ -2750,6 +3373,7 @@ void ftrace_release_mod(struct module *mod) ftrace_free_rec(rec); } } while_for_each_ftrace_rec(); + out_unlock: mutex_unlock(&ftrace_lock); } @@ -2836,6 +3460,10 @@ void __init ftrace_init(void) #else +static struct ftrace_ops global_ops = { + .func = ftrace_stub, +}; + static int __init ftrace_nodyn_init(void) { ftrace_enabled = 1; @@ -2846,12 +3474,47 @@ device_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(command) do { } while (0) -# define ftrace_shutdown(command) do { } while (0) +# define ftrace_startup(ops, command) \ + ({ \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + 0; \ + }) +# define ftrace_shutdown(ops, command) do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) + +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + return 1; +} + #endif /* CONFIG_DYNAMIC_FTRACE */ +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) + return; + + trace_recursion_set(TRACE_INTERNAL_BIT); + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + op = rcu_dereference_raw(ftrace_ops_list); + while (op != &ftrace_list_end) { + if (ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + op = rcu_dereference_raw(op->next); + }; + preempt_enable_notrace(); + trace_recursion_clear(TRACE_INTERNAL_BIT); +} + static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -3144,19 +3807,23 @@ void ftrace_kill(void) */ int register_ftrace_function(struct ftrace_ops *ops) { - int ret; - - if (unlikely(ftrace_disabled)) - return -1; + int ret = -1; mutex_lock(&ftrace_lock); + if (unlikely(ftrace_disabled)) + goto out_unlock; + ret = __register_ftrace_function(ops); - ftrace_startup(0); + if (!ret) + ret = ftrace_startup(ops, 0); + + out_unlock: mutex_unlock(&ftrace_lock); return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_function); /** * unregister_ftrace_function - unregister a function for profiling. @@ -3170,25 +3837,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); - ftrace_shutdown(0); + if (!ret) + ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; } +EXPORT_SYMBOL_GPL(unregister_ftrace_function); int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; - - if (unlikely(ftrace_disabled)) - return -ENODEV; + int ret = -ENODEV; mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (unlikely(ftrace_disabled)) + goto out; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; @@ -3200,11 +3869,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_list != &ftrace_list_end) { - if (ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_ops_list != &ftrace_list_end) { + if (ftrace_ops_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_ops_list->func; else - ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = ftrace_ops_list_func; } } else { @@ -3328,7 +3997,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3393,7 +4062,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ftrace_startup(FTRACE_START_FUNC_RET); + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -3410,7 +4079,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); @@ -3418,6 +4087,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visible before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +4145,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..b0c7aa407943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,7 +5,6 @@ */ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> -#include <linux/ftrace_irq.h> #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/uaccess.h> @@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list) * the reader page). But if the next page is a header page, * its flags will be non zero. */ -static int inline +static inline int rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *page, struct list_head *list) { @@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) return local_read(&bpage->entries) & RB_WRITE_MASK; } -/* Size is determined by what has been commited */ +/* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); @@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } @@ -2198,7 +2216,7 @@ static noinline void trace_recursive_fail(void) printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" "HC[%lu]:SC[%lu]:NMI[%lu]\n", - current->trace_recursion, + trace_recursion_buffer(), hardirq_count() >> HARDIRQ_SHIFT, softirq_count() >> SOFTIRQ_SHIFT, in_nmi()); @@ -2208,9 +2226,9 @@ static noinline void trace_recursive_fail(void) static inline int trace_recursive_lock(void) { - current->trace_recursion++; + trace_recursion_inc(); - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) return 0; trace_recursive_fail(); @@ -2220,9 +2238,9 @@ static inline int trace_recursive_lock(void) static inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!current->trace_recursion); + WARN_ON_ONCE(!trace_recursion_buffer()); - current->trace_recursion--; + trace_recursion_dec(); } #else @@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out - * of our way so we don't accidently swap it. + * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..ee9c921d7f21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -780,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -792,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif @@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; + entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1749,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) @@ -2007,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; - if (iter->lost_events) - trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events); + if (iter->lost_events && + !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events)) + return TRACE_TYPE_PARTIAL_LINE; if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -2529,6 +2537,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -2710,6 +2721,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) @@ -3216,6 +3231,14 @@ waitagain: if (iter->seq.len >= cnt) break; + + /* + * Setting the full flag means we reached the trace_seq buffer + * size and we should leave by partial output condition above. + * One of the trace_seq_* functions is not used properly. + */ + WARN_ONCE(iter->seq.full, "full flag set for trace type %d", + iter->ent->type); } trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -3226,7 +3249,7 @@ waitagain: trace_seq_init(&iter->seq); /* - * If there was nothing to send to user, inspite of consuming trace + * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) @@ -4551,9 +4574,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4566,12 +4591,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); @@ -4581,7 +4607,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, rb_flags); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..229f8591f61d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; @@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern unsigned long ftrace_update_tot_cnt; #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); #endif extern int ring_buffer_expanded; @@ -606,6 +608,7 @@ enum trace_iterator_flags { TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, }; /* @@ -661,8 +664,10 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; - struct filter_pred **preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ + struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -674,11 +679,23 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 + struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); @@ -700,11 +717,23 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; @@ -755,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db0..6302747a1398 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) } /* - * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..e32744c84d94 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -27,7 +27,7 @@ * in the structure. * * * for structures within structures, the format of the internal - * structure is layed out. This allows the internal structure + * structure is laid out. This allows the internal structure * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..686ec399f2a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); + __common_field(int, padding); return ret; } @@ -326,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 @@ -1656,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { - register_ftrace_function(&trace_ops); + int ret; + ret = register_ftrace_function(&trace_ops); + if (WARN_ON(ret < 0)) { + pr_info("Failed to enable function tracer for event tests\n"); + return; + } pr_info("Running tests again, along with the function tracer\n"); event_trace_self_tests(); unregister_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..8008ddcfbf20 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,9 +123,13 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int match = 0; + int type; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; + struct filter_pred *preds; struct filter_pred *pred; - int i; + struct filter_pred *root; + int n_preds; + int done = 0; + + /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + + if (!n_preds) + return 1; + + /* + * n_preds, root and filter->preds are protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; + + pred = root; - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else { + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + + return __push_pred_stack(stack, dest); } -static void filter_disable_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; - call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void filter_disable(struct ftrace_event_call *call) { - int i; + call->flags &= ~TRACE_EVENT_FL_FILTERED; +} +static void __free_filter(struct event_filter *filter) +{ if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ struct filter_pred *pred; int i; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); + if (filter->preds) + __free_preds(filter); - filter->n_preds = 0; + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!filter->preds) - goto oom; + return -ENOMEM; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + filter->a_preds = n_preds; + filter->n_preds = 0; + + for (i = 0; i < n_preds; i++) { + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } - return filter; - -oom: - __free_preds(filter); - return ERR_PTR(-ENOMEM); -} - -static int init_preds(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - return 0; } -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; - fn = filter_pred_and; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; - fn = filter_pred_or; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_pred_stack(&stack, n_preds); + if (err) + return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) + goto fail; + } + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter; + + if (strcmp(call->class->system, system->name) != 0) continue; - /* really apply the filter */ - filter_disable_preds(call); - err = replace_preds(call, filter, ps, filter_string, false); + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); + + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; + + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); if (err) - filter_disable_preds(call); - else { + goto fail_mem; + + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; + if (fail) + goto fail; + + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); } return 0; + fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_preds(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - remove_filter_string(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_filter(filter); goto out_unlock; } @@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1334,18 +1880,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); - if (IS_ERR(filter)) { + filter = __alloc_filter(); + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } @@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1435,9 +1994,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8f..8d0e1cc4e974 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; /* Our two options */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b05980225c..962cdb24ed81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * * returns 1 if * - we are inside irq code - * - we just extered irq code + * - we just entered irq code * * retunns 0 if * - funcgraph-interrupts option is set diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e12d98..c77424be284d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { * skip the latency if the sequence has changed - some other section * did a maximum and could disturb our measurement with serial console * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt + * and what happens together happens separately as well, so this doesn't * decrease the validity of the maximum found: */ static __cacheline_aligned_in_smp unsigned long max_sequence; @@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..f925c45f0afa 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -53,7 +53,6 @@ const char *reserved_field_names[] = { "common_preempt_count", "common_pid", "common_tgid", - "common_lock_depth", FIELD_STRING_IP, FIELD_STRING_RETIP, FIELD_STRING_FUNC, @@ -353,6 +352,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) kfree(data); } +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -367,6 +403,7 @@ enum { FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, + FETCH_MTD_bitfield, FETCH_MTD_END, }; @@ -387,6 +424,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ } \ } @@ -430,9 +468,33 @@ static const struct fetch_type *find_fetch_type(const char *type) if (!type) type = DEFAULT_FETCH_TYPE_STR; + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + type = strchr(type, '/'); + if (!type) + goto fail; + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) if (strcmp(type, fetch_type_table[i].name) == 0) return &fetch_type_table[i]; +fail: return NULL; } @@ -586,7 +648,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); @@ -767,16 +831,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, } break; case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg + 1, 0, &offset); + ret = strict_strtol(arg, 0, &offset); if (ret) break; - if (arg[0] == '-') - offset = -offset; arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { @@ -807,6 +870,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, return ret; } +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + /* String length checking wrapper */ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) @@ -836,6 +934,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0 && t != NULL) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, parg->fetch.fn); @@ -1130,7 +1230,7 @@ static int command_trace_probe(const char *buf) return ret; } -#define WRITE_BUFSIZE 128 +#define WRITE_BUFSIZE 4096 static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -1738,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp) kfree(tp->call.print_fmt); } -/* Make a debugfs interface for controling probe points */ +/* Make a debugfs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..e37de492a9e1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, } EXPORT_SYMBOL(ftrace_print_symbols_seq); +#if BITS_PER_LONG == 32 +const char * +ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, + const struct trace_print_flags_u64 *symbol_array) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%llx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +#endif + const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { @@ -529,24 +556,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) * @entry: The trace entry field from the ring buffer * * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count. */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + if (!trace_seq_printf(s, "%c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) @@ -554,13 +591,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (!ret) - return 0; - - if (entry->lock_depth < 0) - return trace_seq_putc(s, '.'); - - return trace_seq_printf(s, "%d", entry->lock_depth); + return ret; } static int @@ -826,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { + if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..dff763b7baf1 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); struct trace_bprintk_fmt { struct list_head list; - char fmt[0]; + const char *fmt; }; static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) @@ -49,6 +49,7 @@ static void hold_module_trace_bprintk_format(const char **start, const char **end) { const char **iter; + char *fmt; mutex_lock(&btrace_mutex); for (iter = start; iter < end; iter++) { @@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { + tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + if (tb_fmt) + fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt && fmt) { list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; *iter = tb_fmt->fmt; - } else + } else { + kfree(tb_fmt); *iter = NULL; + } } mutex_unlock(&btrace_mutex); } @@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, return 0; } +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. + * The code to print the formats and their addresses passes around the + * address of the fmt string. If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use containerof() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + struct trace_bprintk_fmt *mod_fmt; + + if (list_empty(&trace_bprintk_fmt_list)) + return NULL; + + /* + * v will point to the address of the fmt record from t_next + * v will be NULL from t_start. + * If this is the first pointer or called from start + * then we need to walk the list. + */ + if (!v || start_index == *pos) { + struct trace_bprintk_fmt *p; + + /* search the module list */ + list_for_each_entry(p, &trace_bprintk_fmt_list, list) { + if (start_index == *pos) + return &p->fmt; + start_index++; + } + /* pos > index */ + return NULL; + } + + /* + * v points to the address of the fmt field in the mod list + * structure that holds the module print format. + */ + mod_fmt = container_of(v, typeof(*mod_fmt), fmt); + if (mod_fmt->list.next == &trace_bprintk_fmt_list) + return NULL; + + mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + + return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ + mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ + mutex_unlock(&btrace_mutex); +} + #else /* !CONFIG_MODULES */ __init static int module_trace_bprintk_format_notify(struct notifier_block *self, @@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, { return 0; } +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { } #endif /* CONFIG_MODULES */ @@ -153,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +static const char **find_next(void *v, loff_t *pos) +{ + const char **fmt = v; + int start_index; + + if (!fmt) + fmt = __start___trace_bprintk_fmt + *pos; + + start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + + if (*pos < start_index) + return fmt; + + return find_next_mod_format(start_index, v, fmt, pos); +} + static void * t_start(struct seq_file *m, loff_t *pos) { - const char **fmt = __start___trace_bprintk_fmt + *pos; - - if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) - return NULL; - return fmt; + format_mod_start(); + return find_next(NULL, pos); } static void *t_next(struct seq_file *m, void * v, loff_t *pos) { (*pos)++; - return t_start(m, pos); + return find_next(v, pos); } static int t_show(struct seq_file *m, void *v) @@ -205,6 +300,7 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + format_mod_stop(); } static const struct seq_operations show_format_seq_ops = { diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void stop_sched_trace(struct trace_array *tr) -{ - tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); - return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (sched_ref) - stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ - sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ - sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .start = sched_switch_trace_start, - .stop = sched_switch_trace_stop, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559ed59f..f029dd4fd2ca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 659732eba07c..288541f977fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) #ifdef CONFIG_DYNAMIC_FTRACE +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { + .func = trace_selftest_test_probe1_func, +}; + +static struct ftrace_ops test_probe2 = { + .func = trace_selftest_test_probe2_func, +}; + +static struct ftrace_ops test_probe3 = { + .func = trace_selftest_test_probe3_func, +}; + +static struct ftrace_ops test_global = { + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +static void print_counts(void) +{ + printk("(%d %d %d %d %d) ", + trace_selftest_test_probe1_cnt, + trace_selftest_test_probe2_cnt, + trace_selftest_test_probe3_cnt, + trace_selftest_test_global_cnt, + trace_selftest_test_dyn_cnt); +} + +static void reset_counts(void) +{ + trace_selftest_test_probe1_cnt = 0; + trace_selftest_test_probe2_cnt = 0; + trace_selftest_test_probe3_cnt = 0; + trace_selftest_test_global_cnt = 0; + trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(int cnt) +{ + int save_ftrace_enabled = ftrace_enabled; + struct ftrace_ops *dyn_ops; + char *func1_name; + char *func2_name; + int len1; + int len2; + int ret = -1; + + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace ops #%d: ", cnt); + + ftrace_enabled = 1; + reset_counts(); + + /* Handle PPC64 '.' name */ + func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); + len1 = strlen(func1_name); + len2 = strlen(func2_name); + + /* + * Probe 1 will trace function 1. + * Probe 2 will trace function 2. + * Probe 3 will trace functions 1 and 2. + */ + ftrace_set_filter(&test_probe1, func1_name, len1, 1); + ftrace_set_filter(&test_probe2, func2_name, len2, 1); + ftrace_set_filter(&test_probe3, func1_name, len1, 1); + ftrace_set_filter(&test_probe3, func2_name, len2, 0); + + register_ftrace_function(&test_probe1); + register_ftrace_function(&test_probe2); + register_ftrace_function(&test_probe3); + register_ftrace_function(&test_global); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 0) + goto out; + if (trace_selftest_test_probe3_cnt != 1) + goto out; + if (trace_selftest_test_global_cnt == 0) + goto out; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 1) + goto out; + if (trace_selftest_test_probe3_cnt != 2) + goto out; + + /* Add a dynamic probe */ + dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + if (!dyn_ops) { + printk("MEMORY ERROR "); + goto out; + } + + dyn_ops->func = trace_selftest_test_dyn_func; + + register_ftrace_function(dyn_ops); + + trace_selftest_test_global_cnt = 0; + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 1) + goto out_free; + if (trace_selftest_test_probe3_cnt != 3) + goto out_free; + if (trace_selftest_test_global_cnt == 0) + goto out; + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + + ret = 0; + out_free: + unregister_ftrace_function(dyn_ops); + kfree(dyn_ops); + + out: + /* Purposely unregister in the same order */ + unregister_ftrace_function(&test_probe1); + unregister_ftrace_function(&test_probe2); + unregister_ftrace_function(&test_probe3); + unregister_ftrace_function(&test_global); + + /* Make sure everything is off */ + reset_counts(); + DYN_FTRACE_TEST_NAME(); + DYN_FTRACE_TEST_NAME(); + + if (trace_selftest_test_probe1_cnt || + trace_selftest_test_probe2_cnt || + trace_selftest_test_probe3_cnt || + trace_selftest_test_global_cnt || + trace_selftest_test_dyn_cnt) + ret = -1; + + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, @@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); /* filter only on our function */ - ftrace_set_filter(func_name, strlen(func_name), 1); + ftrace_set_global_filter(func_name, strlen(func_name), 1); /* enable tracing */ ret = tracer_init(trace, tr); @@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(tr, &count); - trace->reset(tr); tracing_start(); /* we should only have one item */ if (!ret && count != 1) { + trace->reset(tr); printk(KERN_CONT ".. filter failed count=%ld ..", count); ret = -1; goto out; } + /* Test the ops with global tracing running */ + ret = trace_selftest_ops(1); + trace->reset(tr); + out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; /* Enable tracing on all functions again */ - ftrace_set_filter(NULL, 0, 1); + ftrace_set_global_filter(NULL, 0, 1); + + /* Test the ops with global tracing off */ + if (!ret) + ret = trace_selftest_ops(2); return ret; } diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5bf..b4c475a0a48b 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) /* used to call mcount */ return 0; } + +int DYN_FTRACE_TEST_NAME2(void) +{ + /* used to call mcount */ + return 0; +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead0c239..b0b53b8e4c25 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, }; static ssize_t diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall) stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + for ( ; start < stop; start++) { - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. - */ - if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; @@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; @@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int init_syscall_trace(struct ftrace_event_call *call) { int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } if (set_syscall_print_fmt(call) < 0) return -ENOMEM; @@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af4889e..b219f1449c54 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !elem->state && active) + if (elem->regfunc && !jump_label_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && elem->state && !active) + else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (!elem->state && active) { - jump_label_enable(&elem->state); - elem->state = active; - } else if (elem->state && !active) { - jump_label_disable(&elem->state); - elem->state = active; - } + if (active && !jump_label_enabled(&elem->key)) + jump_label_inc(&elem->key); + else if (!active && jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); } /* @@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && elem->state) + if (elem->unregfunc && jump_label_enabled(&elem->key)) elem->unregfunc(); - if (elem->state) { - jump_label_disable(&elem->state); - elem->state = 0; - } + if (jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/kernel/uid16.c b/kernel/uid16.c index 419209893d87..51c6e89e8619 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3430a2..92cb706c7fc8 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); /* * Removes a registered user return notifier. Must be called from atomic - * context, and from the same cpu registration occured in. + * context, and from the same cpu registration occurred in. */ void user_return_notifier_unregister(struct user_return_notifier *urn) { diff --git a/kernel/user.c b/kernel/user.c index 5c598ca781df..9e03e9c1df8d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,9 +17,13 @@ #include <linux/module.h> #include <linux/user_namespace.h> +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(3), }, .creator = &root_user, }; @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ struct user_struct root_user = { .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b8ea52..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,8 @@ #include <linux/utsname.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/user_namespace.h> +#include <linux/proc_fs.h> static struct uts_namespace *create_uts_ns(void) { @@ -30,7 +32,8 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -40,6 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); up_read(&uts_sem); return ns; } @@ -50,8 +54,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -60,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; @@ -71,5 +77,44 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); kfree(ns); } + +static void *utsns_get(struct task_struct *task) +{ + struct uts_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->uts_ns; + get_uts_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void utsns_put(void *ns) +{ + put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ + get_uts_ns(ns); + put_uts_ns(nsproxy->uts_ns); + nsproxy->uts_ns = ns; + return 0; +} + +const struct proc_ns_operations utsns_operations = { + .name = "uts", + .type = CLONE_NEWUTS, + .get = utsns_get, + .put = utsns_put, + .install = utsns_install, +}; + diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb6cc1e..f45ea8d2a1ce 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); * woken up through the queue. * * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up + * aborts and is woken up concurrently and no one wakes up * the next waiter. */ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb15776c57..3d0c56ad4792 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -28,7 +28,7 @@ #include <linux/perf_event.h> int watchdog_enabled = 1; -int __read_mostly softlockup_thresh = 60; +int __read_mostly watchdog_thresh = 10; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; @@ -88,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); /* */ +/* + * Hard-lockup warnings should be triggered after just a few seconds. Soft- + * lockups can have false positives under extreme conditions. So we generally + * want a higher threshold for soft lockups than for hard lockups. So we couple + * the thresholds with a factor: we make the soft threshold twice the amount of + * time the hard threshold is. + */ +static int get_softlockup_thresh(void) +{ + return watchdog_thresh * 2; +} /* * Returns seconds, approximately. We don't need nanosecond @@ -102,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu) static unsigned long get_sample_period(void) { /* - * convert softlockup_thresh from seconds to ns + * convert watchdog_thresh from seconds to ns * the divide by 5 is to give hrtimer 5 chances to * increment before the hardlockup detector generates * a warning */ - return softlockup_thresh / 5 * NSEC_PER_SEC; + return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -179,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts) unsigned long now = get_timestamp(smp_processor_id()); /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + softlockup_thresh)) + if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; return 0; @@ -356,7 +370,7 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(); + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); @@ -401,33 +415,37 @@ static void watchdog_nmi_disable(int cpu) { return; } #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* prepare/enable/disable routines */ -static int watchdog_prepare_cpu(int cpu) +static void watchdog_prepare_cpu(int cpu) { struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); WARN_ON(per_cpu(softlockup_watchdog, cpu)); hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - - return 0; } static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) { + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + /* and disable the perf event */ + watchdog_nmi_disable(cpu); + } + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -435,7 +453,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -491,28 +510,25 @@ static void watchdog_disable_all_cpus(void) /* sysctl functions */ #ifdef CONFIG_SYSCTL /* - * proc handler for /proc/sys/kernel/nmi_watchdog + * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ -int proc_dowatchdog_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +int proc_dowatchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + int ret; - if (write) { - if (watchdog_enabled) - watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - } - return 0; -} + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + goto out; -int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (watchdog_enabled && watchdog_thresh) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + +out: + return ret; } #endif /* CONFIG_SYSCTL */ @@ -524,17 +540,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; - int err = 0; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - err = watchdog_prepare_cpu(hotcpu); + watchdog_prepare_cpu(hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: if (watchdog_enabled) - err = watchdog_enable(hotcpu); + watchdog_enable(hotcpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: @@ -547,7 +562,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6578b578ad..0400553f0d04 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -316,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, static struct debug_obj_descr work_debug_descr; +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + /* * fixup_init is called when: * - an active object is initialized @@ -387,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", + .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, @@ -1283,8 +1291,14 @@ __acquires(&gcwq->lock) return true; spin_unlock_irq(&gcwq->lock); - /* CPU has come up inbetween, retry migration */ + /* + * We've raced with CPU hot[un]plug. Give it a breather + * and retry migration. cond_resched() is required here; + * otherwise, we might deadlock against cpu_stop trying to + * bring down the CPU on non-preemptive kernel. + */ cpu_relax(); + cond_resched(); } } @@ -1358,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); @@ -2850,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) } } - /* just in case, make sure it's actually aligned - * - this is affected by PERCPU() alignment in vmlinux.lds.S - */ + /* just in case, make sure it's actually aligned */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; } @@ -3775,8 +3789,10 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); + system_freezable_wq = alloc_workqueue("events_freezable", + WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq); + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); |